diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..b6996a385f7c877a81241bd702b61456306c8806 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/InternVL3.5-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/Qwen3-VL-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/Qwen3-VL-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/Qwen3.5-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/Qwen3.5-9B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/.watchdog.log b/.watchdog.log
new file mode 100644
index 0000000000000000000000000000000000000000..20444bfe0861bc495ec0c2794babe3fcee26a72e
--- /dev/null
+++ b/.watchdog.log
@@ -0,0 +1,281 @@
+[2026-05-07 01:23:57] Watchdog started, stall threshold = 300 s
+[2026-05-07 01:23:58] No upload python found. Restarting...
+[2026-05-07 01:23:58] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 01:23:58] Restart issued, WMI ReturnValue=0, launcher PID=20160
+[2026-05-07 01:24:18] Tracking PID 22168, init Read=12.72 GB
+[2026-05-07 03:23:40] No upload python found. Restarting...
+[2026-05-07 03:23:40] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:23:40] Restart issued, WMI ReturnValue=0, launcher PID=27392
+[2026-05-07 03:24:01] Tracking PID 4136, init Read=9.47 GB
+[2026-05-07 03:27:03] No upload python found. Restarting...
+[2026-05-07 03:27:03] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:27:03] Restart issued, WMI ReturnValue=0, launcher PID=20248
+[2026-05-07 03:27:24] Tracking PID 13668, init Read=10.66 GB
+[2026-05-07 03:30:26] No upload python found. Restarting...
+[2026-05-07 03:30:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:30:26] Restart issued, WMI ReturnValue=0, launcher PID=14656
+[2026-05-07 03:30:46] Tracking PID 11616, init Read=9.98 GB
+[2026-05-07 03:33:48] No upload python found. Restarting...
+[2026-05-07 03:33:48] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:33:49] Restart issued, WMI ReturnValue=0, launcher PID=26872
+[2026-05-07 03:34:09] Tracking PID 1688, init Read=8.44 GB
+[2026-05-07 03:37:11] No upload python found. Restarting...
+[2026-05-07 03:37:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:37:11] Restart issued, WMI ReturnValue=0, launcher PID=25172
+[2026-05-07 03:37:31] Tracking PID 20240, init Read=10.62 GB
+[2026-05-07 03:40:34] No upload python found. Restarting...
+[2026-05-07 03:40:34] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:40:34] Restart issued, WMI ReturnValue=0, launcher PID=15440
+[2026-05-07 03:40:54] Tracking PID 7668, init Read=9.31 GB
+[2026-05-07 03:43:56] No upload python found. Restarting...
+[2026-05-07 03:43:56] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:43:56] Restart issued, WMI ReturnValue=0, launcher PID=12332
+[2026-05-07 03:44:17] Tracking PID 16364, init Read=9.01 GB
+[2026-05-07 03:47:19] No upload python found. Restarting...
+[2026-05-07 03:47:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:47:19] Restart issued, WMI ReturnValue=0, launcher PID=25116
+[2026-05-07 03:47:39] Tracking PID 21724, init Read=8.21 GB
+[2026-05-07 03:50:41] No upload python found. Restarting...
+[2026-05-07 03:50:41] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:50:42] Restart issued, WMI ReturnValue=0, launcher PID=29036
+[2026-05-07 03:51:02] Tracking PID 28372, init Read=8.72 GB
+[2026-05-07 03:54:04] No upload python found. Restarting...
+[2026-05-07 03:54:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:54:04] Restart issued, WMI ReturnValue=0, launcher PID=29684
+[2026-05-07 03:54:24] Tracking PID 20664, init Read=8.09 GB
+[2026-05-07 03:57:26] No upload python found. Restarting...
+[2026-05-07 03:57:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 03:57:26] Restart issued, WMI ReturnValue=0, launcher PID=25116
+[2026-05-07 03:57:47] Tracking PID 15052, init Read=8.79 GB
+[2026-05-07 04:00:49] No upload python found. Restarting...
+[2026-05-07 04:00:49] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:00:49] Restart issued, WMI ReturnValue=0, launcher PID=2668
+[2026-05-07 04:01:09] Tracking PID 8028, init Read=9.38 GB
+[2026-05-07 04:04:11] No upload python found. Restarting...
+[2026-05-07 04:04:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:04:11] Restart issued, WMI ReturnValue=0, launcher PID=4128
+[2026-05-07 04:04:31] Tracking PID 27280, init Read=9.46 GB
+[2026-05-07 04:08:04] No upload python found. Restarting...
+[2026-05-07 04:08:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:08:04] Restart issued, WMI ReturnValue=0, launcher PID=27408
+[2026-05-07 04:08:24] Tracking PID 29060, init Read=10.32 GB
+[2026-05-07 04:11:27] No upload python found. Restarting...
+[2026-05-07 04:11:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:11:27] Restart issued, WMI ReturnValue=0, launcher PID=18400
+[2026-05-07 04:11:47] Tracking PID 28568, init Read=9.09 GB
+[2026-05-07 04:14:49] No upload python found. Restarting...
+[2026-05-07 04:14:49] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:14:49] Restart issued, WMI ReturnValue=0, launcher PID=25660
+[2026-05-07 04:15:10] Tracking PID 7216, init Read=9.88 GB
+[2026-05-07 04:18:12] No upload python found. Restarting...
+[2026-05-07 04:18:12] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:18:12] Restart issued, WMI ReturnValue=0, launcher PID=27632
+[2026-05-07 04:18:32] Tracking PID 26584, init Read=8.21 GB
+[2026-05-07 04:21:34] No upload python found. Restarting...
+[2026-05-07 04:21:34] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:21:34] Restart issued, WMI ReturnValue=0, launcher PID=29684
+[2026-05-07 04:21:54] Tracking PID 1452, init Read=8.94 GB
+[2026-05-07 04:24:57] No upload python found. Restarting...
+[2026-05-07 04:24:57] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:24:57] Restart issued, WMI ReturnValue=0, launcher PID=23396
+[2026-05-07 04:25:17] Tracking PID 2080, init Read=9.61 GB
+[2026-05-07 04:28:19] No upload python found. Restarting...
+[2026-05-07 04:28:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:28:19] Restart issued, WMI ReturnValue=0, launcher PID=29288
+[2026-05-07 04:28:40] Tracking PID 12628, init Read=9.36 GB
+[2026-05-07 04:31:42] No upload python found. Restarting...
+[2026-05-07 04:31:42] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:31:42] Restart issued, WMI ReturnValue=0, launcher PID=29080
+[2026-05-07 04:32:02] Tracking PID 20776, init Read=8.87 GB
+[2026-05-07 04:35:04] No upload python found. Restarting...
+[2026-05-07 04:35:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:35:04] Restart issued, WMI ReturnValue=0, launcher PID=25012
+[2026-05-07 04:35:24] Tracking PID 23744, init Read=9.38 GB
+[2026-05-07 04:38:27] No upload python found. Restarting...
+[2026-05-07 04:38:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:38:27] Restart issued, WMI ReturnValue=0, launcher PID=8960
+[2026-05-07 04:38:47] Tracking PID 28516, init Read=8.45 GB
+[2026-05-07 04:42:19] No upload python found. Restarting...
+[2026-05-07 04:42:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:42:20] Restart issued, WMI ReturnValue=0, launcher PID=24896
+[2026-05-07 04:42:40] Tracking PID 20416, init Read=7.3 GB
+[2026-05-07 04:45:42] No upload python found. Restarting...
+[2026-05-07 04:45:42] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:45:42] Restart issued, WMI ReturnValue=0, launcher PID=16408
+[2026-05-07 04:46:02] Tracking PID 28992, init Read=9.8 GB
+[2026-05-07 04:49:05] No upload python found. Restarting...
+[2026-05-07 04:49:05] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:49:05] Restart issued, WMI ReturnValue=0, launcher PID=27912
+[2026-05-07 04:49:25] Tracking PID 960, init Read=9.27 GB
+[2026-05-07 04:52:27] No upload python found. Restarting...
+[2026-05-07 04:52:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:52:27] Restart issued, WMI ReturnValue=0, launcher PID=15432
+[2026-05-07 04:52:47] Tracking PID 24880, init Read=9.64 GB
+[2026-05-07 04:56:20] No upload python found. Restarting...
+[2026-05-07 04:56:20] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:56:20] Restart issued, WMI ReturnValue=0, launcher PID=3744
+[2026-05-07 04:56:41] Tracking PID 25356, init Read=8.27 GB
+[2026-05-07 04:59:43] No upload python found. Restarting...
+[2026-05-07 04:59:43] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 04:59:43] Restart issued, WMI ReturnValue=0, launcher PID=27888
+[2026-05-07 05:00:04] Tracking PID 27952, init Read=10.45 GB
+[2026-05-07 05:03:06] No upload python found. Restarting...
+[2026-05-07 05:03:06] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:03:06] Restart issued, WMI ReturnValue=0, launcher PID=20772
+[2026-05-07 05:03:26] Tracking PID 1456, init Read=10.27 GB
+[2026-05-07 05:06:29] No upload python found. Restarting...
+[2026-05-07 05:06:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:06:29] Restart issued, WMI ReturnValue=0, launcher PID=13848
+[2026-05-07 05:06:49] Tracking PID 28648, init Read=10.36 GB
+[2026-05-07 05:09:51] No upload python found. Restarting...
+[2026-05-07 05:09:51] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:09:52] Restart issued, WMI ReturnValue=0, launcher PID=6508
+[2026-05-07 05:10:12] Tracking PID 29120, init Read=9.57 GB
+[2026-05-07 05:13:14] No upload python found. Restarting...
+[2026-05-07 05:13:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:13:14] Restart issued, WMI ReturnValue=0, launcher PID=29080
+[2026-05-07 05:13:34] Tracking PID 29408, init Read=7.42 GB
+[2026-05-07 05:17:07] No upload python found. Restarting...
+[2026-05-07 05:17:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:17:07] Restart issued, WMI ReturnValue=0, launcher PID=5536
+[2026-05-07 05:17:27] Tracking PID 24176, init Read=9.01 GB
+[2026-05-07 05:20:29] No upload python found. Restarting...
+[2026-05-07 05:20:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:20:29] Restart issued, WMI ReturnValue=0, launcher PID=27784
+[2026-05-07 05:20:50] Tracking PID 27904, init Read=10.14 GB
+[2026-05-07 05:23:52] No upload python found. Restarting...
+[2026-05-07 05:23:52] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:23:52] Restart issued, WMI ReturnValue=0, launcher PID=3892
+[2026-05-07 05:24:12] Tracking PID 23124, init Read=8.03 GB
+[2026-05-07 05:27:14] No upload python found. Restarting...
+[2026-05-07 05:27:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:27:14] Restart issued, WMI ReturnValue=0, launcher PID=924
+[2026-05-07 05:27:35] Tracking PID 6124, init Read=8.05 GB
+[2026-05-07 05:30:37] No upload python found. Restarting...
+[2026-05-07 05:30:37] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:30:37] Restart issued, WMI ReturnValue=0, launcher PID=28232
+[2026-05-07 05:30:57] Tracking PID 1836, init Read=9.41 GB
+[2026-05-07 05:33:59] No upload python found. Restarting...
+[2026-05-07 05:33:59] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:33:59] Restart issued, WMI ReturnValue=0, launcher PID=29568
+[2026-05-07 05:34:20] Tracking PID 14728, init Read=8.76 GB
+[2026-05-07 05:37:22] No upload python found. Restarting...
+[2026-05-07 05:37:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:37:22] Restart issued, WMI ReturnValue=0, launcher PID=29036
+[2026-05-07 05:37:42] Tracking PID 21932, init Read=9.63 GB
+[2026-05-07 05:40:44] No upload python found. Restarting...
+[2026-05-07 05:40:44] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:40:44] Restart issued, WMI ReturnValue=0, launcher PID=5956
+[2026-05-07 05:41:05] Tracking PID 16784, init Read=10.13 GB
+[2026-05-07 05:44:07] No upload python found. Restarting...
+[2026-05-07 05:44:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:44:07] Restart issued, WMI ReturnValue=0, launcher PID=29208
+[2026-05-07 05:44:27] Tracking PID 26468, init Read=9.85 GB
+[2026-05-07 05:47:29] No upload python found. Restarting...
+[2026-05-07 05:47:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:47:30] Restart issued, WMI ReturnValue=0, launcher PID=17204
+[2026-05-07 05:47:50] Tracking PID 27924, init Read=9.21 GB
+[2026-05-07 05:50:52] No upload python found. Restarting...
+[2026-05-07 05:50:52] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:50:52] Restart issued, WMI ReturnValue=0, launcher PID=7704
+[2026-05-07 05:51:12] Tracking PID 25912, init Read=10.19 GB
+[2026-05-07 05:54:14] No upload python found. Restarting...
+[2026-05-07 05:54:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:54:14] Restart issued, WMI ReturnValue=0, launcher PID=28952
+[2026-05-07 05:54:35] Tracking PID 29272, init Read=8.83 GB
+[2026-05-07 05:57:37] No upload python found. Restarting...
+[2026-05-07 05:57:37] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 05:57:37] Restart issued, WMI ReturnValue=0, launcher PID=25524
+[2026-05-07 05:57:58] Tracking PID 8760, init Read=7.8 GB
+[2026-05-07 06:01:00] No upload python found. Restarting...
+[2026-05-07 06:01:00] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:01:00] Restart issued, WMI ReturnValue=0, launcher PID=29016
+[2026-05-07 06:01:20] Tracking PID 8040, init Read=9.98 GB
+[2026-05-07 06:04:22] No upload python found. Restarting...
+[2026-05-07 06:04:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:04:22] Restart issued, WMI ReturnValue=0, launcher PID=28840
+[2026-05-07 06:04:42] Tracking PID 25172, init Read=8.78 GB
+[2026-05-07 06:07:44] No upload python found. Restarting...
+[2026-05-07 06:07:44] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:07:44] Restart issued, WMI ReturnValue=0, launcher PID=14524
+[2026-05-07 06:08:05] Tracking PID 11872, init Read=8.56 GB
+[2026-05-07 06:11:07] No upload python found. Restarting...
+[2026-05-07 06:11:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:11:07] Restart issued, WMI ReturnValue=0, launcher PID=13756
+[2026-05-07 06:11:27] Tracking PID 15716, init Read=10.04 GB
+[2026-05-07 06:15:00] No upload python found. Restarting...
+[2026-05-07 06:15:00] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:15:00] Restart issued, WMI ReturnValue=0, launcher PID=28472
+[2026-05-07 06:15:20] Tracking PID 20180, init Read=7.92 GB
+[2026-05-07 06:18:22] No upload python found. Restarting...
+[2026-05-07 06:18:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:18:22] Restart issued, WMI ReturnValue=0, launcher PID=15712
+[2026-05-07 06:18:43] Tracking PID 12508, init Read=9.04 GB
+[2026-05-07 06:21:45] No upload python found. Restarting...
+[2026-05-07 06:21:45] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:21:45] Restart issued, WMI ReturnValue=0, launcher PID=22588
+[2026-05-07 06:22:06] Tracking PID 20564, init Read=8.24 GB
+[2026-05-07 06:25:07] No upload python found. Restarting...
+[2026-05-07 06:25:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:25:08] Restart issued, WMI ReturnValue=0, launcher PID=21216
+[2026-05-07 06:25:28] Tracking PID 27056, init Read=8.88 GB
+[2026-05-07 06:28:30] No upload python found. Restarting...
+[2026-05-07 06:28:30] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:28:30] Restart issued, WMI ReturnValue=0, launcher PID=15504
+[2026-05-07 06:28:50] Tracking PID 23240, init Read=8.62 GB
+[2026-05-07 06:31:53] No upload python found. Restarting...
+[2026-05-07 06:31:53] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:31:53] Restart issued, WMI ReturnValue=0, launcher PID=12632
+[2026-05-07 06:32:13] Tracking PID 29112, init Read=7.91 GB
+[2026-05-07 06:35:46] No upload python found. Restarting...
+[2026-05-07 06:35:46] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:35:46] Restart issued, WMI ReturnValue=0, launcher PID=6688
+[2026-05-07 06:36:06] Tracking PID 1880, init Read=9.38 GB
+[2026-05-07 06:39:08] No upload python found. Restarting...
+[2026-05-07 06:39:08] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:39:08] Restart issued, WMI ReturnValue=0, launcher PID=28860
+[2026-05-07 06:39:28] Tracking PID 20996, init Read=10.1 GB
+[2026-05-07 06:42:31] No upload python found. Restarting...
+[2026-05-07 06:42:31] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:42:31] Restart issued, WMI ReturnValue=0, launcher PID=12436
+[2026-05-07 06:42:51] Tracking PID 23428, init Read=8.12 GB
+[2026-05-07 06:45:53] No upload python found. Restarting...
+[2026-05-07 06:45:53] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:45:53] Restart issued, WMI ReturnValue=0, launcher PID=15440
+[2026-05-07 06:46:14] Tracking PID 26756, init Read=9.91 GB
+[2026-05-07 06:49:16] No upload python found. Restarting...
+[2026-05-07 06:49:16] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:49:16] Restart issued, WMI ReturnValue=0, launcher PID=28312
+[2026-05-07 06:49:36] Tracking PID 13260, init Read=8.84 GB
+[2026-05-07 06:53:09] No upload python found. Restarting...
+[2026-05-07 06:53:09] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:53:09] Restart issued, WMI ReturnValue=0, launcher PID=13476
+[2026-05-07 06:53:29] Tracking PID 18072, init Read=8.38 GB
+[2026-05-07 06:56:32] No upload python found. Restarting...
+[2026-05-07 06:56:32] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 06:56:32] Restart issued, WMI ReturnValue=0, launcher PID=17460
+[2026-05-07 06:56:52] Tracking PID 29056, init Read=8.07 GB
+[2026-05-07 07:00:25] No upload python found. Restarting...
+[2026-05-07 07:00:25] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:00:25] Restart issued, WMI ReturnValue=0, launcher PID=29528
+[2026-05-07 07:00:45] Tracking PID 21456, init Read=9.85 GB
+[2026-05-07 07:03:47] No upload python found. Restarting...
+[2026-05-07 07:03:47] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:03:47] Restart issued, WMI ReturnValue=0, launcher PID=24264
+[2026-05-07 07:04:08] Tracking PID 3440, init Read=9.81 GB
+[2026-05-07 07:07:10] No upload python found. Restarting...
+[2026-05-07 07:07:10] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:07:10] Restart issued, WMI ReturnValue=0, launcher PID=19964
+[2026-05-07 07:07:30] Tracking PID 29076, init Read=10.53 GB
+[2026-05-07 07:11:03] No upload python found. Restarting...
+[2026-05-07 07:11:03] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:11:03] Restart issued, WMI ReturnValue=0, launcher PID=5848
+[2026-05-07 07:11:23] Tracking PID 20508, init Read=9.59 GB
+[2026-05-07 07:14:26] No upload python found. Restarting...
+[2026-05-07 07:14:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:14:26] Restart issued, WMI ReturnValue=0, launcher PID=23236
+[2026-05-07 07:14:46] Tracking PID 2916, init Read=8.73 GB
+[2026-05-07 07:17:48] No upload python found. Restarting...
+[2026-05-07 07:17:48] Restarting upload (LFS dedup will skip already uploaded chunks)...
+[2026-05-07 07:17:48] Restart issued, WMI ReturnValue=0, launcher PID=29588
+[2026-05-07 07:18:09] Tracking PID 26428, init Read=9.4 GB
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/all_results.json b/checkpoints/GLM-4.6V-Flash-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6e3b419bb08f6b62c9d9805d663158c57f63ea8
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 2477163648385024.0,
+    "train_loss": 0.20598802658081056,
+    "train_runtime": 35266.4791,
+    "train_samples_per_second": 5.671,
+    "train_steps_per_second": 0.089
+}
\ No newline at end of file
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja b/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..efe6364aa1a49684fe075d7ff43003340e1db78a
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja
@@ -0,0 +1,140 @@
+[gMASK]<sop>
+{%- if tools -%}
+<|system|>
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{% for tool in tools %}
+{{ tool | tojson(ensure_ascii=False) }}
+{% endfor %}
+</tools>
+
+For each function call, output the function name and arguments within the following XML format:
+<tool_call>{function-name}
+<arg_key>{arg-key-1}</arg_key>
+<arg_value>{arg-value-1}</arg_value>
+<arg_key>{arg-key-2}</arg_key>
+<arg_value>{arg-value-2}</arg_value>
+...
+</tool_call>{%- endif -%}
+{%- macro visible_text(content) -%}
+    {%- if content is string -%}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping -%}
+        {%- for item in content -%}
+            {%- if item is mapping and item.type == 'text' -%}
+                {{- item.text }}
+            {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}
+                <|begin_of_image|><|image|><|end_of_image|>
+            {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}
+                <|begin_of_video|><|video|><|end_of_video|>
+            {%- elif item is string -%}
+                {{- item }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{- content }}
+    {%- endif -%}
+{%- endmacro -%}
+{%- set ns = namespace(last_user_index=-1) %}
+{%- for m in messages %}
+    {%- if m.role == 'user' %}
+        {% set ns.last_user_index = loop.index0 -%}
+    {%- endif %}
+{%- endfor %}
+{% for m in messages %}
+{%- if m.role == 'user' -%}<|user|>
+{% if m.content is string %}
+{{ m.content }}
+{%- else %}
+{%- for item in m.content %}
+{% if item.type == 'video' or 'video' in item %}
+<|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}
+<|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}
+{{ item.text }}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
+{%- elif m.role == 'assistant' -%}
+<|assistant|>
+{%- set reasoning_content = '' %}
+{%- set content = visible_text(m.content) %}
+{%- if m.reasoning_content is string %}
+    {%- set reasoning_content = m.reasoning_content %}
+{%- else %}
+    {%- if '</think>' in content %}
+        {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+        {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+    {%- endif %}
+{%- endif %}
+{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
+{{ '\n<think>' + reasoning_content.strip() +  '</think>'}}
+{%- else -%}
+{{ '\n<think></think>' }}
+{%- endif -%}
+{%- if content.strip() -%}
+{{ '\n' + content.strip() }}
+{%- endif -%}
+{% if m.tool_calls %}
+{% for tc in m.tool_calls %}
+{%- if tc.function %}
+    {%- set tc = tc.function %}
+{%- endif %}
+{{ '\n<tool_call>' + tc.name }}
+{% set _args = tc.arguments %}
+{% for k, v in _args.items() %}
+<arg_key>{{ k }}</arg_key>
+<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
+{% endfor %}
+</tool_call>{% endfor %}
+{% endif %}
+{%- elif m.role == 'tool' -%}
+{%- if m.content is string -%}
+{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+    {{- '<|observation|>' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- m.content }}
+{{- '\n</tool_response>' }}
+{% elif m.content is iterable and m.content is not mapping %}
+{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+{{- '<|observation|>' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{%- for tr in m.content -%}
+  {%- if tr is mapping and tr.type is defined -%}
+    {%- set t = tr.type | lower -%}
+    {%- if t == 'text' and tr.text is defined -%}
+{{ tr.text }}
+    {%- elif t in ['image', 'image_url'] -%}
+<|begin_of_image|><|image|><|end_of_image|>
+    {%- elif t in ['video', 'video_url'] -%}
+<|begin_of_video|><|video|><|end_of_video|>
+    {%- else -%}
+{{ tr | tojson(ensure_ascii=False) }}
+    {%- endif -%}
+  {%- else -%}
+{{ tr.output if tr.output is defined else tr }}
+  {%- endif -%}
+{%- endfor -%}
+{{- '\n</tool_response>' }}
+{%- else -%}
+<|observation|>{% for tr in m.content %}
+
+<tool_response>
+{{ tr.output if tr.output is defined else tr }}
+</tool_response>{% endfor -%}
+{% endif -%}
+{%- elif m.role == 'system' -%}
+<|system|>
+{{ visible_text(m.content) }}
+{%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<|assistant|>
+{{'<think></think>\n' if (enable_thinking is defined and not enable_thinking) else ''}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/config.json b/checkpoints/GLM-4.6V-Flash-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b8c90732f3778a5bb41fde87b5cca52730074fe
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/config.json
@@ -0,0 +1,72 @@
+{
+  "architectures": [
+    "Glm4vForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "eos_token_id": 151329,
+  "hidden_size": 4096,
+  "image_end_token_id": 151340,
+  "image_start_token_id": 151339,
+  "image_token_id": 151363,
+  "model_type": "glm4v",
+  "pad_token_id": 151329,
+  "text_config": {
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "eos_token_id": [
+      151329,
+      151336,
+      151338
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 13696,
+    "max_position_embeddings": 131072,
+    "model_type": "glm4v_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 2,
+    "pad_token_id": 151329,
+    "rms_norm_eps": 1e-05,
+    "rope_parameters": {
+      "mrope_section": [
+        8,
+        12,
+        12
+      ],
+      "partial_rotary_factor": 0.5,
+      "rope_theta": 500000,
+      "rope_type": "default"
+    },
+    "use_cache": false,
+    "vocab_size": 151552
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_end_token_id": 151342,
+  "video_start_token_id": 151341,
+  "video_token_id": 151364,
+  "vision_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "depth": 24,
+    "dtype": "bfloat16",
+    "hidden_act": "silu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 1536,
+    "image_size": 336,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 13696,
+    "model_type": "glm4v_vision",
+    "num_heads": 12,
+    "out_hidden_size": 4096,
+    "patch_size": 14,
+    "rms_norm_eps": 1e-05,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  }
+}
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json b/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json
new file mode 100644
index 0000000000000000000000000000000000000000..d68a7434c3c70e2d63a3dab9c6bed2eef9716d64
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json
@@ -0,0 +1,56 @@
+{
+  "mae_dx": 0.1517896551724138,
+  "rmse_dx": 0.5050280292665226,
+  "mae_dy": 0.13570689655172413,
+  "rmse_dy": 0.40379185488190017,
+  "mae_dz": 0.017967241379310345,
+  "rmse_dz": 0.15680698656144998,
+  "mae_dpitch": 0.24627758620689652,
+  "rmse_dpitch": 0.5965444891927231,
+  "mae_dyaw": 1.0261448275862068,
+  "rmse_dyaw": 2.459724339755617,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.26298103448275856,
+  "mae_position": 0.10182126436781609,
+  "mae_rotation": 0.42414080459770115,
+  "rmse_overall": 1.068394337204253,
+  "wp1_euc_mae": 0.0698010264307822,
+  "wp1_euc_median": 0.01999999999999999,
+  "wp2_euc_mae": 0.1401695004658457,
+  "wp2_euc_median": 0.04123105625617661,
+  "wp3_euc_mae": 0.22301934350856006,
+  "wp3_euc_median": 0.07211102550927984,
+  "wp4_euc_mae": 0.32865394783587415,
+  "wp4_euc_median": 0.1104536101718727,
+  "wp5_euc_mae": 0.44338792793915116,
+  "wp5_euc_median": 0.15905694150420963,
+  "euclidean_mae": 0.24100634923604267,
+  "ADE": 0.24100634923604267,
+  "FDE": 0.44338792793915116,
+  "ADE_median": 0.08327688731593763,
+  "FDE_median": 0.15905694150420963,
+  "SR@0.5m": 0.8951724137931034,
+  "SR@1.0m": 0.9513793103448276,
+  "SR@2.0m": 0.9808620689655172,
+  "SR@5.0m": 0.9968965517241379,
+  "TrajSR@1.0m": 0.8974137931034483,
+  "TrajSR@2.0m": 0.9577586206896552,
+  "TrajSR@5.0m": 0.9922413793103448,
+  "RotAcc@1.0deg": 0.7027586206896552,
+  "RotAcc@5.0deg": 0.9586206896551724,
+  "RotAcc@10.0deg": 0.9889655172413793,
+  "wp1_rot_mae": 0.5029051706109685,
+  "wp2_rot_mae": 0.7513635215329055,
+  "wp3_rot_mae": 1.0546360645612183,
+  "wp4_rot_mae": 1.4243170022546052,
+  "wp5_rot_mae": 1.784744600833039,
+  "rotation_euc_mae": 1.1035932719585473,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json b/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1329f41b4b2856cdec0a6f99d5946550add7d47c
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json
@@ -0,0 +1,16 @@
+{
+  "_from_model_config": true,
+  "do_sample": true,
+  "eos_token_id": [
+    151329,
+    151329,
+    151336,
+    151338,
+    151348
+  ],
+  "pad_token_id": 151329,
+  "temperature": 0.8,
+  "top_k": 2,
+  "top_p": 0.6,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors b/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cc7b59ff3f6a066137f54581c56df169c99f1d50
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8a32229e6fe30d156e4259207d341d5b0022d08d8df59cd08760bf85cd5d215
+size 20585645128
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json b/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b14e663a0204b0d1d28fdc1e6515145147b5ce85
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json
@@ -0,0 +1,63 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_processor_type": "Glm46VImageProcessor",
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "merge_size": 2,
+    "patch_size": 14,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 9633792,
+      "shortest_edge": 12544
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Glm46VProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "max_duration": 300,
+    "max_image_size": {
+      "longest_edge": 47040000
+    },
+    "merge_size": 2,
+    "num_frames": 16,
+    "patch_size": 14,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 100352000,
+      "shortest_edge": 12544
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Glm46VVideoProcessor"
+  }
+}
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..5045eeb65854e3e7732f8d69dda6529fd862a0bc
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eecde1f225a86abef606164ceeb446737e592c4e7a40afe5cbf3ce8328e3df99
+size 19970886
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..70b612ef9461b1f4390d0773c15c0fa9dfabf11c
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json
@@ -0,0 +1,19 @@
+{
+  "backend": "tokenizers",
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": [
+    "<|user|>",
+    "<|observation|>",
+    "</answer>"
+  ],
+  "is_local": true,
+  "model_max_length": 128000,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Glm46VProcessor",
+  "remove_space": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend"
+}
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/train_results.json b/checkpoints/GLM-4.6V-Flash-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6e3b419bb08f6b62c9d9805d663158c57f63ea8
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 2477163648385024.0,
+    "train_loss": 0.20598802658081056,
+    "train_runtime": 35266.4791,
+    "train_samples_per_second": 5.671,
+    "train_steps_per_second": 0.089
+}
\ No newline at end of file
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json b/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..01464cb06b021fac72b88d87d0fd1df501adb789
--- /dev/null
+++ b/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 20.093808181688754,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.7523126602172852,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 16.520568445399164,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.684361743927002,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 7.062991511064744,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.46736898422241213,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.0572338350229438,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.3222517013549805,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.768970780796944,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.29146518707275393,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8158618748659492,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.28341834545135497,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7218086220464439,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.2903137683868408,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.7459109221323802,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.2718811988830566,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.7186860317140319,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.2660067558288574,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.765918500231858,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.26980152130126955,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.7344200083929374,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.2695180416107178,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7057487416602337,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.2582674264907837,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.6996888798419932,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.2612154960632324,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.7150606291134206,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.2520437717437744,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7697242977250355,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.2501786470413208,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.6327215717833664,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.24434318542480468,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.7947096523807732,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.25281600952148436,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6717890611061146,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.2454531669616699,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.7151585341922304,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.2505363464355469,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.8601334705182279,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.2505714178085327,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6106680426063227,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.24775364398956298,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6262984320818072,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.24066565036773682,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6078537303186395,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.24378209114074706,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5889510426869463,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.23820171356201172,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5658292689427505,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.23654117584228515,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.5757166706348428,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.23743386268615724,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5807034355359694,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.23970918655395507,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5634022487351626,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.23490209579467775,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5520223075835592,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.2404552936553955,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.5587222430473198,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.24298410415649413,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.542281258937415,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.22964231967926024,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6339707011249724,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.22938170433044433,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5290859105179109,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.23717782497406006,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.574404257271199,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.23035426139831544,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5887719210155044,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.230421781539917,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5618660264892863,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.23192777633666992,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.589113954603133,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.23194873332977295,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.552581223712263,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.22629022598266602,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.5506631212695152,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.22567858695983886,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.5210832665844604,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.22596418857574463,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5132503738005023,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.23435051441192628,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.5264119522984109,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.2258882999420166,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5122311697688684,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.22527906894683838,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5105227090020142,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.2246992588043213,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.5307268054645026,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.2193459987640381,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.46923570087876276,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.21553544998168944,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.4881836025298746,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.22505784034729004,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.50506411723612,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.22673957347869872,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5086993434891525,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.22294471263885499,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5280465251135189,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.21789300441741943,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5337843871964275,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.22192811965942383,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.4884343559217744,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.21614904403686525,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.5316592538281818,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.21973915100097657,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.518761429695226,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.23207192420959472,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.511452747175852,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.21878607273101808,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.49924599926539726,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.22075920104980468,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.5259698850641532,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.22276382446289061,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.501751014152873,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.21954989433288574,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5167201593356286,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.2145029067993164,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5006060240232905,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.22003324031829835,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.4477538874438277,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.21916275024414061,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4832933241270485,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.2203526973724365,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5359361967005408,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.21596732139587402,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.5708003689943741,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.2171140193939209,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.4831169531465719,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.22040581703186035,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.4982382400104379,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.21841506958007811,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.4741071764041748,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.2143453598022461,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.47853550451884025,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.21545085906982422,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4902247743421047,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.2166231393814087,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4611717059287351,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.21284222602844238,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.4815654128340087,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.21358721256256102,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5049245613626656,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.21447527408599854,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.4758997167389971,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.21809780597686768,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.5423173365143716,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.21390962600708008,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.44572231492476455,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.21404554843902587,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4848421373802031,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.21710457801818847,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.518532765750562,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.21285481452941896,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5008678397970389,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.21155524253845215,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.48720974823345864,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.20655455589294433,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.49528977499161353,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.2101435422897339,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.4532686250809506,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.21684365272521972,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.49803115837380546,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.21193151473999022,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.5153783225404047,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.21344296932220458,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4765272009238815,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.21319386959075928,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.5228745905777464,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.21112470626831054,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.4583805155148445,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.2108391284942627,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4920259584441244,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.20768051147460936,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5200095181799963,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.21211957931518555,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4788503683270311,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.20693025588989258,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4666456137071941,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.20890872478485106,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.4794527139448749,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.20777955055236816,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.4616610840587355,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.21416122913360597,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.47725407011391663,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.20783448219299316,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4596350013424985,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.20199823379516602,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4940149958405755,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.20049993991851806,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.4891958940488766,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.20765538215637208,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5373640149223949,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.2103745460510254,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5035919946088194,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.2148181438446045,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.498832420199319,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.20667800903320313,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5016480811009209,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.20847175121307374,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5355131410598809,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.21192097663879395,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.49710771531514497,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.20849306583404542,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.4702938633516668,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.20947628021240233,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.47328762785100176,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.20568199157714845,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.48090607151875236,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.20522446632385255,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.46870182419574746,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.2044762134552002,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.49284234006242156,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.20835609436035157,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.425482663225026,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.20575876235961915,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.46647669293000804,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.20992250442504884,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.4464343988416538,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.20092244148254396,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5116088744854695,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.20036702156066893,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 4.940314739525779,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.21338913440704346,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4721397920115156,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.20792775154113768,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.5105519348301445,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.2061443328857422,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.523180958068929,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.1962942123413086,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.5027335828799008,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.2031947612762451,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.48735270934050073,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.20448057651519774,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5075332440871839,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.20594587326049804,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.4485083644552742,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.20264167785644532,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.4568492727427137,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.20010733604431152,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.46103501808297814,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.2026883602142334,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.46609834517793386,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.21079249382019044,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.49992288047242467,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.2031094551086426,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.4746264528155682,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.20238199234008789,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.45243385346205817,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.20085253715515136,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.48931316379420287,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.20520598888397218,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.5229456414008956,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.19984333515167235,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4773055647919508,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.20064287185668944,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4824962267097886,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.20138092041015626,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.47245858486532044,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.20704314708709717,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.47670774891547607,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.20015296936035157,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.4553061754046557,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.19692450761795044,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5124220374815842,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.2016448497772217,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.4809826187082155,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.19793987274169922,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.48798480914379067,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.20041651725769044,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4589600174491072,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.19874777793884277,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.44810416886840765,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.19696075916290284,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4584133576368786,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.20014967918395996,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.46474214573205574,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.19937365055084227,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.4671452045462699,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.19703936576843262,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.48450994567172556,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.20506525039672852,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.48940983460177095,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.19508613348007203,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.5510507698314822,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.20555310249328612,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.45473195837081576,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.19813575744628906,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.4854091562037593,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.19694712162017822,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4637441174996577,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.1959088087081909,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4460606032902631,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.20340628623962403,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4768689558424143,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.19146734476089478,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.46631038217283505,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.19588179588317872,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.48197350793708515,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.20245718955993652,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.44408940491911375,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.19959219694137573,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.47255519902507054,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.1994904398918152,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.48800627171777977,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.19807126522064208,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4748737132528679,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.19806113243103027,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.47637730688550123,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.19955278635025026,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5411554495039922,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.202299165725708,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.48642193804707995,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.19863581657409668,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5363553346933208,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.19409118890762328,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.47187050499878397,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.1912764310836792,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5163595948637988,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.20115599632263184,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.5033907485073755,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.19687057733535768,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.4753722793172304,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.20292258262634277,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.46781165760957,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.18959319591522217,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4803554793777817,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.19619333744049072,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.43719126287209875,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.19511375427246094,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.4689037514921924,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.19399585723876953,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.48479905355532704,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.1896076202392578,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.516453973005613,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.1939442992210388,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.47710575683228795,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.19078316688537597,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.47654319681013313,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.19690483808517456,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.5030326386548561,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.19325432777404786,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.49452423153374125,
+      "learning_rate": 2.5e-06,
+      "loss": 0.19436432123184205,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5135088244704792,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.1878933072090149,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5160118206798595,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.19355961084365844,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5069308846787346,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.19122695922851562,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5385800538703149,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.18820159435272216,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.49129457413116234,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.19197521209716797,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.4908165776123557,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.19534649848937988,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.49497656453552125,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.19484236240386962,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.466973816624908,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.191474986076355,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.498294237386886,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.18636202812194824,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5110432771457695,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.19112749099731446,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4923044532988948,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.19590845108032226,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.49881036242858373,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.18917866945266723,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.5070848566140863,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.1955878973007202,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.5245919327161893,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.18681724071502687,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.5043139368489675,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.185194993019104,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5180452275250914,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.1928567886352539,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.5320215436686966,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.19074957370758056,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.4725862343819939,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.19257795810699463,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.46908638481055026,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.18594731092453004,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.5595713557618127,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.19121139049530028,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.507704360185881,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.18624544143676758,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.4860192603301521,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.19231630563735963,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.5275367662218493,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.19608126878738402,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.49282562967431837,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.19235665798187257,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5182260002744055,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.18613014221191407,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.5103313601861706,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.1881113052368164,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5451499180289584,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.18735458850860595,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5090636315844644,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.1885282278060913,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.4758742975901025,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.18241598606109619,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.49602490022248863,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.19074147939682007,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.520455285125112,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.19024887084960937,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5234524283247538,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.18900917768478392,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.4762667999370438,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.18147594928741456,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4931916769975977,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.18815698623657226,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5595459804684163,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.18958520889282227,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.551381176131532,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.1908926248550415,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5155022860725758,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.18872777223587037,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6037433446756716,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.18800405263900757,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.5613773833705744,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.18743778467178346,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5277286435676816,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.18769149780273436,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5487755330646784,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.1866753101348877,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.5319334450957595,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.19193503856658936,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.5061777243502238,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.18499069213867186,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5063080834031065,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.18700281381607056,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5014045449596041,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.18185386657714844,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5417896517828541,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.18632771968841552,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5710908799443121,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.18732945919036864,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.5180508096448415,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.18602204322814941,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5480758918229119,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.18710973262786865,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.5631818126474104,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.18828771114349366,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4833634541431531,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.18257718086242675,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5051522117897481,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.17966469526290893,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5404271805851407,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.18814800977706908,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5147342090287059,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.1908186197280884,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5558495401174878,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.1859324097633362,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.6185737554957568,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.18477405309677125,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5398647348951853,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.1857767939567566,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.5450678028060058,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.184822678565979,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5999082382312588,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.18381783962249756,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.5175099712487172,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.18641353845596315,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5367638040398911,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.18090612888336183,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.563594153188617,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.18149322271347046,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5304713442318342,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.18464915752410888,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.535119183480021,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.1792607307434082,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5724539486438234,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.183684778213501,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5589161632397335,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.18650429248809813,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5386156132762686,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.1821720838546753,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5321288466713382,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.18489625453948974,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5670301824645666,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.18495336771011353,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.6058756306995335,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.1848145008087158,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.5477002870283818,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.1817490816116333,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5458027173632266,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.18078404664993286,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5772130708628379,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.18299766778945922,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5674146932938366,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.1799448013305664,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5238538237059384,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.1826066255569458,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.5857270779434125,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.18549437522888185,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5274424793724192,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.1865037798881531,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5820741885019232,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.18739759922027588,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.559971376703767,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.1853170394897461,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5456407872897143,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.18589026927948,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5866178273652722,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.1827709197998047,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.588749656654477,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.1794450044631958,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5778176841150756,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.18060548305511476,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.566426267196354,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.18326361179351808,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.5763812670051818,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.17798151969909667,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5465254160649792,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.1761394739151001,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5717164779412172,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.18241602182388306,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.5532919690194787,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.1799800157546997,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5485503614596886,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.18037915229797363,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5921392059955939,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.17850807905197144,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.6173777417506611,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.17773046493530273,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5704461135916385,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.17534157037734985,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.6016600022490033,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.17784465551376344,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.5793357844007763,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.17941689491271973,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.5402101980665998,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.17694177627563476,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5764717205309013,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.18307201862335204,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.6021889152147203,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.18175405263900757,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5783244972157141,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.17646790742874147,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.573282650162234,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.18459818363189698,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.6039696058732922,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.18084490299224854,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5916439702722857,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.18053301572799682,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5703451942226244,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.1792958378791809,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5672304805383847,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.1819172501564026,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.5784570642525821,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.17332799434661866,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.575451427907292,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.18558990955352783,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5133461724908028,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.18399085998535156,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6123280023323261,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.1844745397567749,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.5761361465385083,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.17304511070251466,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6034414454227958,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.17871806621551514,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5923974971972374,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.17667040824890137,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.603734748014922,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.1787508487701416,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5750783540393263,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.18438329696655273,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.5716942434142535,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.1802410364151001,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.6031535401501658,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.17950894832611083,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5724651470732645,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.17800890207290648,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5920847136083833,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.17559461593627929,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5600845233888927,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.17780338525772094,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5760602589693042,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.1794909954071045,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.59057677772977,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.1795297384262085,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5693422129621047,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.17676992416381837,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5656532345210596,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.17433459758758546,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.5865348817236666,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.1795581579208374,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.6034375830769324,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.18066773414611817,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.598761782830776,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.1803189516067505,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5410244646488507,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.18075671195983886,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.6102805369465131,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.18705531358718872,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5798299084498433,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.17331962585449218,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.5999449762716584,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.17158935070037842,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.6212882795186798,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.17738908529281616,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.564746561855876,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.17667733430862426,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5852806549215316,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.18457986116409303,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.5991233203919278,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.18215363025665282,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.6041102228390866,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.18243587017059326,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.5869697890802611,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.18486570119857787,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5595978682216465,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.18047856092453002,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5638404572903396,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.18349089622497558,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.5582534730189623,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.18032891750335694,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5757824692806152,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.1812995433807373,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5718406851297113,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.17747504711151124,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.549457935685087,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.17633507251739503,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 2477163648385024.0,
+      "train_loss": 0.20598802658081056,
+      "train_runtime": 35266.4791,
+      "train_samples_per_second": 5.671,
+      "train_steps_per_second": 0.089
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2477163648385024.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png b/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6f8a107f88ce6fd0298ddbb917d6ebee5122146
Binary files /dev/null and b/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png differ
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/all_results.json b/checkpoints/Gemma-4-E4B-it-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..95fe79d7fbf9dc20758150408b914257954b6c4a
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1.0913057758773248e+16,
+    "train_loss": 0.7292402684783935,
+    "train_runtime": 30167.0559,
+    "train_samples_per_second": 6.63,
+    "train_steps_per_second": 0.104
+}
\ No newline at end of file
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja b/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..afb1d517bedb410d5bb32df4300d17f6e5888e2a
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja
@@ -0,0 +1,263 @@
+{%- macro format_parameters(properties, required) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'OBJECT' -%}
+                ,properties:{
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                {%- elif value is mapping -%}
+                    {{- format_parameters(value, value['required'] | default([])) -}}
+                {%- endif -%}
+                }
+                {%- if value['required'] -%}
+                    ,required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    ,items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{ bos_token }}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {{- messages[0]['content'] | trim -}}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+        {{- '<|turn>' + role + '\n' }}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- if message['tool_responses'] -%}
+                {#- Tool Response handling -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- '<|tool_response>' -}}
+                    {%- if tool_response['response'] is mapping -%}
+                        {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
+                        {%- for key, value in tool_response['response'] | dictsort -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                            {%- if not loop.last %},{% endif -%}
+                        {%- endfor -%}
+                        {{- '}' -}}
+                    {%- else -%}
+                        {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
+                    {%- endif -%}
+                    {{- '<tool_response|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_response' -%}
+            {%- endif -%}
+
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '\n\n<|image|>\n\n' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '\n\n<|video|>\n\n' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+        {%- if not (message['tool_responses'] and not message['content']) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/config.json b/checkpoints/Gemma-4-E4B-it-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f697944ac8a7124c0cdbc70da313658644d7a22b
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/config.json
@@ -0,0 +1,199 @@
+{
+  "architectures": [
+    "Gemma4ForConditionalGeneration"
+  ],
+  "audio_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_chunk_size": 12,
+    "attention_context_left": 13,
+    "attention_context_right": 0,
+    "attention_invalid_logits_value": -1000000000.0,
+    "attention_logit_cap": 50.0,
+    "chunk_size_feed_forward": 0,
+    "conv_kernel_size": 5,
+    "dtype": "bfloat16",
+    "gradient_clipping": 10000000000.0,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "model_type": "gemma4_audio",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_proj_dims": 1536,
+    "problem_type": null,
+    "residual_weight": 0.5,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "subsampling_conv_channels": [
+      128,
+      32
+    ],
+    "use_clipped_linears": true
+  },
+  "audio_token_id": 258881,
+  "boa_token_id": 256000,
+  "boi_token_id": 255999,
+  "bos_token_id": 2,
+  "dtype": "bfloat16",
+  "eoa_token_id": 258883,
+  "eoa_token_index": 258883,
+  "eoi_token_id": 258882,
+  "eos_token_id": 106,
+  "hidden_size": 2560,
+  "image_token_id": 258880,
+  "initializer_range": 0.02,
+  "model_type": "gemma4",
+  "pad_token_id": 0,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_k_eq_v": false,
+    "bos_token_id": 2,
+    "dtype": "bfloat16",
+    "enable_moe_block": false,
+    "eos_token_id": 1,
+    "expert_intermediate_size": null,
+    "final_logit_softcapping": 30.0,
+    "global_head_dim": 512,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "hidden_size_per_layer_input": 256,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_text",
+    "moe_intermediate_size": null,
+    "num_attention_heads": 8,
+    "num_experts": null,
+    "num_global_key_value_heads": null,
+    "num_hidden_layers": 42,
+    "num_key_value_heads": 2,
+    "num_kv_shared_layers": 18,
+    "pad_token_id": 0,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "full_attention": {
+        "partial_rotary_factor": 0.25,
+        "rope_theta": 1000000.0,
+        "rope_type": "proportional"
+      },
+      "sliding_attention": {
+        "rope_theta": 10000.0,
+        "rope_type": "default"
+      }
+    },
+    "sliding_window": 512,
+    "tie_word_embeddings": true,
+    "top_k_experts": null,
+    "use_bidirectional_attention": null,
+    "use_cache": false,
+    "use_double_wide_mlp": false,
+    "vocab_size": 262144,
+    "vocab_size_per_layer_input": 262144
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 258884,
+  "vision_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "chunk_size_feed_forward": 0,
+    "default_output_length": 280,
+    "dtype": "bfloat16",
+    "global_head_dim": 64,
+    "head_dim": 64,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_vision",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 16,
+    "num_key_value_heads": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "position_embedding_size": 10240,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 100.0,
+      "rope_type": "default"
+    },
+    "standardize": false,
+    "use_clipped_linears": true
+  },
+  "vision_soft_tokens_per_image": 280
+}
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json b/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a51cfe202de611bfc2d01db96808e984bc26986
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json
@@ -0,0 +1,56 @@
+{
+  "mae_dx": 0.48666724137931033,
+  "rmse_dx": 1.0707492462177417,
+  "mae_dy": 0.3855034482758621,
+  "rmse_dy": 0.7843001492655289,
+  "mae_dz": 0.04997413793103449,
+  "rmse_dz": 0.156602120477122,
+  "mae_dpitch": 0.9934068965517242,
+  "rmse_dpitch": 1.7330746049166195,
+  "mae_dyaw": 2.2219862068965517,
+  "rmse_dyaw": 3.906024586323736,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.6895896551724138,
+  "mae_position": 0.30738160919540225,
+  "mae_rotation": 1.0717977011494253,
+  "rmse_overall": 1.8278735619896387,
+  "wp1_euc_mae": 0.2665493636964831,
+  "wp1_euc_median": 0.18,
+  "wp2_euc_mae": 0.5012943438070621,
+  "wp2_euc_median": 0.31144823004794875,
+  "wp3_euc_mae": 0.7271333853911885,
+  "wp3_euc_median": 0.48,
+  "wp4_euc_mae": 0.958032444080531,
+  "wp4_euc_median": 0.6351377754492935,
+  "wp5_euc_mae": 1.1876023185914943,
+  "wp5_euc_median": 0.7778817364281356,
+  "euclidean_mae": 0.7281223711133517,
+  "ADE": 0.7281223711133519,
+  "FDE": 1.1876023185914943,
+  "ADE_median": 0.49122803576716423,
+  "FDE_median": 0.7778817364281356,
+  "SR@0.5m": 0.5736206896551724,
+  "SR@1.0m": 0.783448275862069,
+  "SR@2.0m": 0.9222413793103448,
+  "SR@5.0m": 0.9898275862068966,
+  "TrajSR@1.0m": 0.5887931034482758,
+  "TrajSR@2.0m": 0.8353448275862069,
+  "TrajSR@5.0m": 0.9724137931034482,
+  "RotAcc@1.0deg": 0.39948275862068966,
+  "RotAcc@5.0deg": 0.83,
+  "RotAcc@10.0deg": 0.9762068965517241,
+  "wp1_rot_mae": 1.8561397413473146,
+  "wp2_rot_mae": 2.249132034716281,
+  "wp3_rot_mae": 2.6355453352548355,
+  "wp4_rot_mae": 3.048629056478642,
+  "wp5_rot_mae": 3.45811827126434,
+  "rotation_euc_mae": 2.6495128878122824,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json b/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6633d85c38512e1932d115a29dca1605862e16e2
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json
@@ -0,0 +1,15 @@
+{
+  "bos_token_id": 2,
+  "do_sample": true,
+  "eos_token_id": [
+    106,
+    1,
+    106,
+    50
+  ],
+  "pad_token_id": 0,
+  "temperature": 1.0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors b/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c78bfd5e93a040783e456059d34ae42d8c59ddaf
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa115532595f57272ed0b16337a23de6762ffa60ab858147f5f51f1cff34105b
+size 15992595884
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json b/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5465974d23e1eca2c46c2809b26c997946ce0d90
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac1c3eff3ad5a4d3913ba8e5f36a14ed1c7e51d6
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json
@@ -0,0 +1,96 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": true,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "split_special_tokens": false,
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>"
+}
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/train_results.json b/checkpoints/Gemma-4-E4B-it-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..95fe79d7fbf9dc20758150408b914257954b6c4a
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1.0913057758773248e+16,
+    "train_loss": 0.7292402684783935,
+    "train_runtime": 30167.0559,
+    "train_samples_per_second": 6.63,
+    "train_steps_per_second": 0.104
+}
\ No newline at end of file
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json b/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f369a1edf7f1302c8239943827d3b72e96e101f
--- /dev/null
+++ b/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 366.0841096744857,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 23.85431823730469,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 367.47333882445946,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 23.65589599609375,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 367.96579270464326,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 22.780029296875,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 332.5732884154056,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 20.279689025878906,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 219.53674756423746,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 15.498806762695313,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 156.1487544830451,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 10.388201904296874,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 37.96869040917498,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 3.7560958862304688,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 16.783464772614202,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 2.033830261230469,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 5.438256169634593,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 1.0431390762329102,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 3.6935246150045775,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.8069572448730469,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 9.218312544625562,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.7057615280151367,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 5.394484238866305,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.6301750183105469,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 6.577481237217732,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.5898516654968262,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 3.4158074483641068,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.5524418830871582,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 4.032046521040006,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.5317594051361084,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 5.468634675306576,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.5184277534484864,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 3.4313124951156424,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.5204483985900878,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 5.13400179254009,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.5058025360107422,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 5.9183424216837786,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.5073411941528321,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 5.625073986187664,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.5000103950500489,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 5.050603467051007,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.488192081451416,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 11.776866822937645,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.48699202537536623,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 7.438900018795585,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.4820102691650391,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 4.3491840646532065,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.47640199661254884,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 3.472565426233091,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.4729574203491211,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 3.1912744148161942,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.4786433219909668,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 3.9698013424470777,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.4748369216918945,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 8.11949393489321,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.4681865692138672,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 4.7349566199381234,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.46743001937866213,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 4.756427284033883,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.46964178085327146,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 4.86570605379029,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.45612516403198244,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 5.762654788054032,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.45009474754333495,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 3.501477346053355,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.4523477554321289,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 7.957279740713588,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.44190473556518556,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 7.660308885793361,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.43758931159973147,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 5.8839479464224205,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.4371060371398926,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 3.452842882877267,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.42999753952026365,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 4.825810317520427,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.4285894393920898,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 5.379766821254966,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.42139811515808107,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 4.854730410799869,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.42040281295776366,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 4.6616615938661745,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.4207456588745117,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 4.5341296475975605,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.414472770690918,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 5.217437981869656,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.41441006660461427,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 3.561516924716779,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.40767755508422854,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 3.815692337476438,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.3997596263885498,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 4.559242371997167,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.39360842704772947,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 3.432229350472061,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.40289998054504395,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 5.375227134041046,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.3982266664505005,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 5.539585521677851,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.39270691871643065,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.4147092253345743,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.3853750705718994,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 4.444175842440995,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.38694162368774415,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 3.3493207902303475,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.3797153949737549,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 3.0499194254019097,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.38063654899597166,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 3.141871281336489,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.39005699157714846,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 3.979297184951908,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.38013472557067873,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 3.7669251986704113,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.37832577228546144,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 2.983798431857085,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.38150479793548586,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 3.657787030439589,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.3775670528411865,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 3.551048022748126,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.37199065685272215,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 4.750977601329729,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.3790221452713013,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 7.684545118387627,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.37508673667907716,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 5.872575231845199,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.3811868906021118,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 3.9960144706794316,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.37326750755310056,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 3.9998452581157657,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.3709099769592285,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 2.6973135018594343,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.3728507995605469,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 4.478756132604264,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.36782591342926024,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 2.5620662799375378,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.36416780948638916,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 2.9398359151969884,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.36772732734680175,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 3.404020172068192,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.3651688575744629,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 2.588678061474319,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.3649880409240723,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 3.5390276900279773,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.3612825870513916,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 3.8670986814196473,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.3657612085342407,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 3.0276354554801217,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.36682844161987305,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 5.205227283770371,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.3605961322784424,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 3.1037248816470737,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.3625338554382324,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 3.827009314178272,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.36273531913757323,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 2.553717481812464,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.35740270614624026,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 2.8273485176739563,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.3613132953643799,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 3.242165291552063,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.3525214672088623,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 2.607635187753211,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.3569283723831177,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 3.2439792578606204,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.357681941986084,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 2.9728036180938284,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.3537192106246948,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 2.556165398739607,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.3528641700744629,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 2.593548528246384,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.3564203500747681,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 3.428440109671292,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.3518026828765869,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 4.993564850548027,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.35079920291900635,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 3.340510283095063,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.3466078042984009,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 2.6894615056191644,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.3472653865814209,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 1.891440124594712,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.34483723640441893,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 3.223309297530686,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.3444544553756714,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 3.1032077209020468,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.34571564197540283,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 2.5407458837926638,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.3534140110015869,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 3.1253686498979123,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.3445676326751709,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 3.740083740472538,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.33962287902832033,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 4.724023923665093,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.3363780498504639,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 3.597276867142834,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.3429840087890625,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 2.97998267012516,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.3470882177352905,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 3.1405275857331856,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.35235731601715087,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 3.774584318253359,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.3418004512786865,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 3.4325438208492605,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.3456583499908447,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 3.1407187711443916,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.3446167469024658,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 2.6154317834679226,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.34354138374328613,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 2.8993376261822648,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.3451784372329712,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 2.3351853591260574,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.33875834941864014,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 3.4295735539049605,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.33980226516723633,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 2.6388634367096735,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.3395829200744629,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 3.211009486395674,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.3433471441268921,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 3.4377414857289317,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.34047765731811525,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 3.131466112366247,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.3420018434524536,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 2.388207923072635,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.3351470470428467,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 2.2910707329028117,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.33539299964904784,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 2.156244058261874,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.33862009048461916,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 2.6382644444406296,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.34041495323181153,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 2.5960896388831545,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.3390871524810791,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 3.657074741484568,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.3278806209564209,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 2.9587401358526075,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.3368961334228516,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 1.965300565427372,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.3339808464050293,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 2.90985435283837,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.3376065969467163,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 3.27190473511409,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.3342454433441162,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 2.0468253424433165,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.32965447902679446,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 2.5123150680001576,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.3357837677001953,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 3.148104290988529,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.34343953132629396,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 2.488823913942074,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.333116340637207,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 3.0225259799028645,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.3318933486938477,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 3.3439153363899115,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.32833037376403806,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 2.72884090647899,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.3360243082046509,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 2.5999080124511966,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.33025145530700684,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 3.0518346526448488,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.3306798219680786,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 2.0509087709244507,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.33091559410095217,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 2.908137390744499,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.33455810546875,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 3.0396312942670796,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.331624960899353,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 3.282462978283218,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.32364490032196047,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 2.2269456751164727,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.3319955348968506,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 2.8364899461485527,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.3252741813659668,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 2.89515974439621,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.3287111520767212,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 2.311001238312573,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.32939796447753905,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 2.4126049139350734,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.32168779373168943,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 3.1765584413022254,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.32864985466003416,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 3.154206643410634,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.3260640621185303,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 3.4230687653412564,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.32304584980010986,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 2.6276396964869684,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.33318138122558594,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 2.754821177049362,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.32256054878234863,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 2.881952130772473,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.33387539386749265,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 2.5217047707442966,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.32321481704711913,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 2.976679985492794,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.32245721817016604,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 2.527563459090948,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.32239205837249757,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 1.997832889519553,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.32758924961090086,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 2.299101703675196,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.31891183853149413,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 2.9210746413068907,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.32430353164672854,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.6265609696149146,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.3312281608581543,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 2.5956160397204786,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.3238035202026367,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 2.231793404952503,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.32330875396728515,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 2.5723097920479763,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.32520170211791993,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 2.691498291676849,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.3273704290390015,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 2.0511124933056375,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.3250606536865234,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 2.872290392112785,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.3281686782836914,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 2.330246614888919,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.32374157905578616,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 2.0520711406500394,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.3169667720794678,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 2.048670737699487,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.3160442590713501,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 2.8695840695234303,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.32544608116149903,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 1.976397223627746,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.3229134798049927,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 2.7070609575351807,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.32655487060546873,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 2.6027463070090993,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.31430754661560056,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 2.4274539931656585,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.3237884759902954,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 1.9308598632845329,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.32111692428588867,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 2.1321964485217784,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.3178241729736328,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 3.1390388403682534,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.31754770278930666,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 2.137535651695072,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.3183767795562744,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 2.353751591087722,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.31382954120635986,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 2.58704039056448,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.3226866483688354,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 2.709677414439902,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.3166576623916626,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 2.0859245317104107,
+      "learning_rate": 2.5e-06,
+      "loss": 0.3218212127685547,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 2.3347357869338436,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.31246294975280764,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 2.5799420800617106,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.31868853569030764,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.8867509619529406,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.31381807327270506,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 2.625660671305278,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.31146280765533446,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 2.8862495653341544,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.3111546993255615,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 2.267020272744141,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.32178173065185545,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 2.073205643473978,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.31751441955566406,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 2.232045258362397,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.3135702610015869,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 3.4632505976937744,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.3126095771789551,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 2.7008114205550022,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.31725611686706545,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 2.640110404643157,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.3210929870605469,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 2.9154181525967924,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.31817543506622314,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 2.3435756622683916,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.3186073303222656,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 2.391868801860604,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.3081700563430786,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 2.27033295147997,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.30403599739074705,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 2.1095837820360157,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.31353535652160647,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 2.284519052184323,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.31677517890930174,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 2.237766836173548,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.3152945041656494,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 2.7842715157490434,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.30818216800689696,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 2.4813744091778784,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.31820502281188967,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 2.0821248606030887,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.31040709018707274,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 2.9336859566866975,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.31292426586151123,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 2.511253012965921,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.32196660041809083,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 2.4545922236833455,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.31464810371398927,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 2.277913414668649,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.30748977661132815,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 3.6960663974743273,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.31303911209106444,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 2.5169048193896844,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.309541130065918,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.625801312197355,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.31228773593902587,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 3.166705714244592,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.30709683895111084,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 2.7529448920288755,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.3121641159057617,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 2.4164266641009386,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.31612191200256345,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 2.1475852674178486,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.30926761627197263,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 3.027937409819003,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.3059820652008057,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 2.3663528005893575,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.30960190296173096,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 2.5495655090650806,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.3146512508392334,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 2.563871195645732,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.3116560935974121,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 2.4316488926893314,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.31447796821594237,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 2.446980089200077,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.3153538703918457,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 2.2511595317617283,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.31273808479309084,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 2.459748219135552,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.31059865951538085,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 2.4570005490031805,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.31122384071350095,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 2.5940034157380447,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.3145638704299927,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 2.8940635665298644,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.31133465766906737,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 2.2603444512196216,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.31110968589782717,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 2.3697248986342223,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.3046250820159912,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 2.9149559965372083,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.3056375503540039,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 2.785583016537511,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.3074607849121094,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 2.238483419316128,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.30821986198425294,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 2.0963873111402225,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.31098227500915526,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 2.3511311934322725,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.3125911712646484,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 2.0182013166602735,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.30506420135498047,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 2.0904990978865654,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.30170474052429197,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 2.3591898483151525,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.3140627145767212,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 2.3874798738589553,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.3148207187652588,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 2.462173136321867,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.30828402042388914,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 2.2877287929832946,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.30725433826446535,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 2.5585705908550413,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.30991530418395996,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 2.415441399008779,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.31087689399719237,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 2.6450690086623285,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.3067446231842041,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 2.363123649822279,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.30767192840576174,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 2.245412676348008,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.30107917785644533,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 2.5736642361970503,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.303986120223999,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 2.6844109007429138,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.3074802875518799,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 2.412670568786912,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.30130510330200194,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 3.176069141824472,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.3105806827545166,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 2.23339472526297,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.3084972620010376,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 2.6912839020724175,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.3067460536956787,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 2.5426618104677976,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.30809898376464845,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 2.55531536817282,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.3071744441986084,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 2.285863017236424,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.3093477725982666,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 2.3600405361881767,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.30798888206481934,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 2.0247328579355726,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.3048464298248291,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 2.7079172300622334,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.3093360424041748,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 2.8011999237207967,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.30233092308044435,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 2.202966089641912,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.3048731327056885,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 2.6510546903467755,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.3104764461517334,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 2.510992490322273,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.31223044395446775,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.7988283248607604,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.3109541893005371,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 2.370572243788772,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.3075347900390625,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 2.057318428676814,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.30944228172302246,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 2.9526395601791937,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.30773684978485105,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 2.1808951881567165,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.30281686782836914,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 2.501184820191482,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.30731327533721925,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 2.4433836822113304,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.30861892700195315,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 2.482955525732734,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.3014340400695801,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 2.516375989369738,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.302032995223999,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 2.2676227598264926,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.30550131797790525,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 2.3829531066293126,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.3046237945556641,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 2.0018887466739548,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.3024258852005005,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 2.5182571334882597,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.3063870906829834,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 2.7441991027074355,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.30306272506713866,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 2.161963722714269,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.3009947299957275,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 2.7694143698141285,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.30188300609588625,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 2.6814413975784217,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.3024703025817871,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 2.4444711671869306,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.3008608102798462,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 2.6886622183433015,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.30821614265441893,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 2.641784614909192,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.30515303611755373,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 2.5595370943122444,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.3007267236709595,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 2.283872628964803,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.30617167949676516,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 2.558413840419693,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.3038905143737793,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 2.4952676522317567,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.3054081201553345,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 2.247637838190116,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.3028261661529541,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.5035963414447804,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.3062435626983643,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 1.918923632238423,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.2972743034362793,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 2.4603002546330845,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.3119321346282959,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 2.1315709346733667,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.30900893211364744,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 2.3753122188061218,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.3069151401519775,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 2.207415755325001,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.2986165523529053,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 2.178561452169741,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.3010268688201904,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 2.5082064593439393,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.2984073877334595,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 2.366814729694057,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.3026757001876831,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 2.380709246306716,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.310437536239624,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 2.484246324702375,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.30369887351989744,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 2.6412244000001786,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.3044759750366211,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 2.4542916560781967,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.30136928558349607,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 2.631911471911446,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.30144243240356444,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 2.2530200189747243,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.3027902603149414,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 3.1331936934174123,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.30441856384277344,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 2.5134219010551067,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.306389307975769,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 2.372785201514508,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.30292179584503176,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 2.407913531878434,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.29895825386047364,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 2.2376435379528865,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.30271134376525877,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 2.653290725438786,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.3062829732894897,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 2.42511171945876,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.3066932439804077,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 2.1534299736877895,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.3068870544433594,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 2.209956884835794,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.31453580856323243,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 2.5928568899987017,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.2991969108581543,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 2.385980918167846,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.2963397026062012,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 2.374100986684654,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.30301966667175295,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 2.094256605734986,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.2994666576385498,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 1.916185239441286,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.3080729007720947,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 2.3446521041543207,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.3026130199432373,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 2.5854178252734323,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.309655499458313,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 1.9908162772434517,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.31186389923095703,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 2.1977962094508534,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.30645883083343506,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 2.2397406638818147,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.3090504169464111,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 2.0435901319475036,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.3049570322036743,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 2.371036869409615,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.3073274612426758,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 2.2551729202130457,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.30047030448913575,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 2.2067081414460827,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.3023503065109253,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 1.0913057758773248e+16,
+      "train_loss": 0.7292402684783935,
+      "train_runtime": 30167.0559,
+      "train_samples_per_second": 6.63,
+      "train_steps_per_second": 0.104
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0913057758773248e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png b/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..f78c1b6ee186ebfafd3c8e04e74c3f1677868e33
Binary files /dev/null and b/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png differ
diff --git a/checkpoints/InternVL3.5-8B-SFT/all_results.json b/checkpoints/InternVL3.5-8B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4436aa71bbc2b5c4eb3af92edaf52a6a1e07b35c
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1955525886476288.0,
+    "train_loss": 0.1948647116279602,
+    "train_runtime": 28413.61,
+    "train_samples_per_second": 7.039,
+    "train_steps_per_second": 0.11
+}
\ No newline at end of file
diff --git a/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja b/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..2e5dcf6ad9edfd8c89a52da19c23bd8a8d87a7f2
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja
@@ -0,0 +1,6 @@
+{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<image>
+' }}{% elif content['type'] == 'video' %}{{ '<video>
+' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+' }}{% endif %}
\ No newline at end of file
diff --git a/checkpoints/InternVL3.5-8B-SFT/config.json b/checkpoints/InternVL3.5-8B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..268660df4c23f8befa8d354468d81c1d9041a925
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/config.json
@@ -0,0 +1,121 @@
+{
+  "architectures": [
+    "InternVLForConditionalGeneration"
+  ],
+  "downsample_ratio": 0.5,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_size": 4096,
+  "image_seq_length": 256,
+  "image_token_id": 151671,
+  "model_type": "internvl",
+  "pad_token_id": 151643,
+  "projector_hidden_act": "gelu",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "debug": false,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "ep_size": 1,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 12288,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "micro_forward": false,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "pad_token_id": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 1000000,
+      "rope_type": "default"
+    },
+    "skip_checkpoint": false,
+    "sliding_window": null,
+    "tie_word_embeddings": false,
+    "use_cache": false,
+    "use_deepep": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "vision_config": {
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 1024,
+    "image_size": [
+      448,
+      448
+    ],
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-06,
+    "layer_scale_init_value": 0.1,
+    "model_type": "internvl_vision",
+    "norm_type": "layer_norm",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": [
+      14,
+      14
+    ],
+    "projection_dropout": 0.0,
+    "use_absolute_position_embeddings": true,
+    "use_mask_token": false,
+    "use_mean_pooling": true,
+    "use_qk_norm": false
+  },
+  "vision_feature_layer": -1,
+  "vision_feature_select_strategy": "default"
+}
diff --git a/checkpoints/InternVL3.5-8B-SFT/eval_results_job_internvl35_8b_internvl35_8b_20260430_002347.json b/checkpoints/InternVL3.5-8B-SFT/eval_results_job_internvl35_8b_internvl35_8b_20260430_002347.json
new file mode 100644
index 0000000000000000000000000000000000000000..37add930d1bcce8a9e4d426d24035df7d04d59d4
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/eval_results_job_internvl35_8b_internvl35_8b_20260430_002347.json
@@ -0,0 +1,55 @@
+{
+  "mae_dx": 0.1673913793103448,
+  "rmse_dx": 0.558917781258475,
+  "mae_dy": 0.14736034482758623,
+  "rmse_dy": 0.43480112378118263,
+  "mae_dz": 0.014543103448275864,
+  "rmse_dz": 0.08856353651475307,
+  "mae_dpitch": 0.3550155172413793,
+  "rmse_dpitch": 0.7902604437560551,
+  "mae_dyaw": 1.1672620689655173,
+  "rmse_dyaw": 2.7476475518096417,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.3085954022988506,
+  "mae_position": 0.10976494252873564,
+  "mae_rotation": 0.5074258620689654,
+  "rmse_overall": 1.2030075672648746,
+  "wp1_euc_mae": 0.07507887870117307,
+  "wp1_euc_median": 0.020000000000000018,
+  "wp2_euc_mae": 0.15300439055300805,
+  "wp2_euc_median": 0.04472135954999579,
+  "wp3_euc_mae": 0.24257533874781437,
+  "wp3_euc_median": 0.0806225774829854,
+  "wp4_euc_mae": 0.35212693283711727,
+  "wp4_euc_median": 0.12369316876852973,
+  "wp5_euc_mae": 0.4665492393220971,
+  "wp5_euc_median": 0.17131835484052965,
+  "euclidean_mae": 0.25786695603224197,
+  "ADE": 0.25786695603224197,
+  "FDE": 0.4665492393220971,
+  "ADE_median": 0.09423994273900672,
+  "FDE_median": 0.17131835484052965,
+  "SR@0.5m": 0.8844827586206897,
+  "SR@1.0m": 0.9520689655172414,
+  "SR@2.0m": 0.9801724137931035,
+  "SR@5.0m": 0.9956896551724138,
+  "TrajSR@1.0m": 0.8931034482758621,
+  "TrajSR@2.0m": 0.9586206896551724,
+  "TrajSR@5.0m": 0.9887931034482759,
+  "RotAcc@1.0deg": 0.6555172413793103,
+  "RotAcc@5.0deg": 0.9496551724137932,
+  "RotAcc@10.0deg": 0.9872413793103448,
+  "wp1_rot_mae": 0.6533116266968418,
+  "wp2_rot_mae": 0.9349310214465391,
+  "wp3_rot_mae": 1.2746919556832232,
+  "wp4_rot_mae": 1.6432791843561125,
+  "wp5_rot_mae": 2.0312724349773714,
+  "rotation_euc_mae": 1.3074972446320177,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "transformers"
+}
\ No newline at end of file
diff --git a/checkpoints/InternVL3.5-8B-SFT/generation_config.json b/checkpoints/InternVL3.5-8B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9fb44e20ba433dc943bd1431a7bcaee508e5da5
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/generation_config.json
@@ -0,0 +1,8 @@
+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    151645
+  ],
+  "pad_token_id": 151643,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoints/InternVL3.5-8B-SFT/model.safetensors b/checkpoints/InternVL3.5-8B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4b078e34ee4fbde29d4d7acd337513ccc90f4f74
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ca84729e7bbaee8063f7e92a2435d5d69d0d38983a451a5085d11c886ab5e34
+size 17056747968
diff --git a/checkpoints/InternVL3.5-8B-SFT/processor_config.json b/checkpoints/InternVL3.5-8B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef75af1480039bd9e0d67dd53fc3da740f495b0b
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/processor_config.json
@@ -0,0 +1,79 @@
+{
+  "image_processor": {
+    "crop_to_patches": false,
+    "data_format": "channels_first",
+    "default_to_square": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.485,
+      0.456,
+      0.406
+    ],
+    "image_processor_type": "GotOcr2ImageProcessor",
+    "image_std": [
+      0.229,
+      0.224,
+      0.225
+    ],
+    "max_patches": 12,
+    "min_patches": 1,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "height": 448,
+      "width": 448
+    }
+  },
+  "image_seq_length": 256,
+  "processor_class": "InternVLProcessor",
+  "video_processor": {
+    "data_format": "channels_first",
+    "default_to_square": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": false,
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "initial_shift": true,
+    "model_valid_processing_keys": [
+      "do_convert_rgb",
+      "do_resize",
+      "size",
+      "size_divisor",
+      "default_to_square",
+      "resample",
+      "do_rescale",
+      "rescale_factor",
+      "do_normalize",
+      "image_mean",
+      "image_std",
+      "do_pad",
+      "do_center_crop",
+      "crop_size",
+      "data_format",
+      "input_data_format",
+      "device"
+    ],
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "height": 384,
+      "width": 384
+    },
+    "video_processor_type": "InternVLVideoProcessor"
+  }
+}
diff --git a/checkpoints/InternVL3.5-8B-SFT/tokenizer.json b/checkpoints/InternVL3.5-8B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..de249ee3c7b21b365aa6c1715efcfdf1a0b8d507
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6581c44164d273d4222df982905a7e0450dcf3a4a7ebe98f9ec53e4de05beffe
+size 11424300
diff --git a/checkpoints/InternVL3.5-8B-SFT/tokenizer_config.json b/checkpoints/InternVL3.5-8B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..99a7cfec6d0bfb0c5334c94c34059296a3456361
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/tokenizer_config.json
@@ -0,0 +1,29 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_end|>"
+  ],
+  "is_local": true,
+  "model_max_length": 14588,
+  "model_specific_special_tokens": {
+    "context_image_token": "<IMG_CONTEXT>",
+    "end_image_token": "</img>",
+    "start_image_token": "<img>",
+    "video_token": "<|video_pad|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "InternVLProcessor",
+  "split_special_tokens": false,
+  "start_image_token": "<img>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<|video_pad|>"
+}
diff --git a/checkpoints/InternVL3.5-8B-SFT/train_results.json b/checkpoints/InternVL3.5-8B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4436aa71bbc2b5c4eb3af92edaf52a6a1e07b35c
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1955525886476288.0,
+    "train_loss": 0.1948647116279602,
+    "train_runtime": 28413.61,
+    "train_samples_per_second": 7.039,
+    "train_steps_per_second": 0.11
+}
\ No newline at end of file
diff --git a/checkpoints/InternVL3.5-8B-SFT/trainer_state.json b/checkpoints/InternVL3.5-8B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2bc796f8df70d971130e32cb19d1881623ceb148
--- /dev/null
+++ b/checkpoints/InternVL3.5-8B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 7.051180674009678,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.45998425483703614,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 6.386709802443142,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.44952831268310545,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 5.3631010908380015,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.3993690013885498,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.852155255839625,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.3118258237838745,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.4705571475990448,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.2786674976348877,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.39720799855122535,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.2685645580291748,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.34144681090493506,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.27388153076171873,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.29670665469044527,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.2562382221221924,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.2721949763566226,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.2521932125091553,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.30509418891876505,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.2553669214248657,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.2710599378904947,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.2548961162567139,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.3180117403374185,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.2442842960357666,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.2695352733592907,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.24766459465026855,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.3064535363854503,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.23845260143280028,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.29068646435586043,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.23559024333953857,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3186203915842237,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.23029537200927735,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.32754075011707046,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.2385089635848999,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.38484368515577855,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.23111426830291748,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.299502856060473,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.23530282974243164,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.3078123289936782,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.23611860275268554,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.30717572422426626,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.23241891860961914,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.30949630760689323,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.2257370948791504,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.2734080768093611,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.22820501327514647,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.31986420438753294,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.22324295043945314,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3271935835910018,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.22092509269714355,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.28164810489138675,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.22088565826416015,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.2806581165168549,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.2235860824584961,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.2818314341404028,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.21951718330383302,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.2755068214230404,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.22480430603027343,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.2991295097090295,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.22600164413452148,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3239664056294863,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.21372499465942382,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.2881723034602484,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.21513206958770753,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.27877645475403023,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.22072982788085938,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.28224550070191395,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.2146312713623047,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.26853026150431764,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.21449072360992433,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.2706003676564934,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.2151791572570801,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.28539700359373177,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.21470160484313966,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.29207494833659137,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.21086468696594238,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.263064572322246,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.20984139442443847,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.2865097413347111,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.21040558815002441,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.2738592744136949,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.21828360557556153,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.24970618963972283,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.2106489658355713,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.2629431248486553,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.20979018211364747,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.2725505982701801,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.20948367118835448,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.2704053444924022,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.2033768653869629,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.31765035973786815,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.20044360160827637,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.2502347867419884,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.2100567102432251,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.2934932151321077,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.21090621948242189,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.26660410583968774,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.20775444507598878,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.28519626596006936,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.20256528854370118,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.2798675045050625,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.20623595714569093,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.28222884808809434,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.20097856521606444,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.2788085638053828,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.20387496948242187,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.27554368272722524,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.21524934768676757,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.26559833530971816,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.20402135848999023,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.26554772115650926,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.20598478317260743,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.3223429392292309,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.20747475624084472,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.24355730693567182,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.20512137413024903,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3001767473938059,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.19992779493331908,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.2835389086432711,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.20495295524597168,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.26019712508927473,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.2035728931427002,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.2657949563176517,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.20619282722473145,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.28295912792439204,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.2016925811767578,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.2902924299127114,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.20164456367492675,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.24256417788990398,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.20648303031921386,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.27122351891055063,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.20356349945068358,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.2600569122055766,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.19959055185317992,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.25345624369635567,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.2007960557937622,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2544929334444299,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.20300769805908203,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.2897145189307127,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.19880002737045288,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.2560542526589546,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.19960763454437255,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.3097914904022575,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.2003918170928955,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.2887809607696432,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.20292911529541016,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.28917619670340877,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.20004558563232422,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.27043264841658887,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.19843683242797852,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2645651727770741,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.20258240699768065,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.2777282429222742,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.1994302749633789,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.28182340391383837,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.197939932346344,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.2390403179508666,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.19328031539916993,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.30761446053746666,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.1962287425994873,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.26058296777263723,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.20304152965545655,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3023946784650888,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.19789116382598876,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.28736962727648746,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.19968997240066527,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.28571881200537336,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.20007586479187012,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.295019179491335,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.19763822555541993,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.29653404936460237,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.19755616188049316,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.2642449071502374,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.19439829587936402,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.25657475126133233,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.1987127423286438,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.28458590654874555,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.19352295398712158,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.2793448530701338,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.1959349274635315,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.3163250873932861,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.1957021713256836,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.2933329400631374,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.20156097412109375,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.2688085579058971,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.1948945164680481,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.28553708068341715,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.18995710611343383,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.26518275753825254,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.18871839046478273,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.28692003913342795,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.19533849954605104,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.27227233815166896,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.19751427173614503,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.27245831220598377,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.20111453533172607,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.2482632152661181,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.19406617879867555,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2892442073812178,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.19525597095489503,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.26392559431034407,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.19918107986450195,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.27003912401002855,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.1964997172355652,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.2664017566726753,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.19660145044326782,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.2744161118643581,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.1935626745223999,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.2717693030089869,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.19286205768585205,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.259292524653773,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.1922353148460388,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.2739674960468982,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.19553282260894775,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.272965837223612,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.19367674589157105,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.2463436566122966,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.19772920608520508,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.24672019869428047,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.1892393112182617,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.2673060417093708,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.1892371416091919,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.26767314750680543,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.19717614650726317,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.2796524343786416,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.19503848552703856,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.2816284710404393,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.19529366493225098,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.31949481569871324,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.18451136350631714,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.2723449306170863,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.191510009765625,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.27747112521567696,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.19263410568237305,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.2719099807762723,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.19341590404510497,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.29074805846664115,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.19099385738372804,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.2517462589595264,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.18874506950378417,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.2591827841853763,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.19069148302078248,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.2702770742629986,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.19928838014602662,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.27788866885326635,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.19237933158874512,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.2656255469668472,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.19055767059326173,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.28446496354107703,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.18940582275390624,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.4152862546777316,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.19362914562225342,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.28537432061728957,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.1886904716491699,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.29038310854731697,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.18923617601394654,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.32132086585692904,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.19005811214447021,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.2669423345384319,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.19575085639953613,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.28931030301965927,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.18975239992141724,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.28948269391746034,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.18510067462921143,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.2821484963772758,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.1907583475112915,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.27423888046510925,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.18690071105957032,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.30811658453814883,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.19000139236450195,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.24402420223179272,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.18803791999816893,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.24576039119812526,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.1866163969039917,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.2949022587874532,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.1890486001968384,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.2582182081996982,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.18851635456085206,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.2722482128131903,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.18554195165634155,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.27484686642107964,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.19425587654113768,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.28258316073925427,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.1843653440475464,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.28555979247115143,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.1941524863243103,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.2969010932820601,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.18759560585021973,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.30795851200957197,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.18544803857803344,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.28527072571260903,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.18547136783599855,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.2533866816866613,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.19228132963180541,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.2776942873045479,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.1812342882156372,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.2744584915099732,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.1853887915611267,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2866639604297882,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.1918737769126892,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.26310322890713356,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.18995790481567382,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.28320054398109096,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.18962399959564208,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.2654570443815982,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.18734774589538575,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.2658181920127404,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.1873406410217285,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.29250213703445505,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.1892526626586914,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3090995402302473,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.19257349967956544,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.28272052629438726,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.18868753910064698,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3954198531333443,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.18459179401397705,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.2563261821009193,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.18163517713546753,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.28115388072993086,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.19087796211242675,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.27079102831839946,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.1863863706588745,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.27596423249252744,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.19303735494613647,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.2682301223547138,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.18045294284820557,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.27817197846381203,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.18653267621994019,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.25176165708531945,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.18530716896057128,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.272299195118528,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.18495219945907593,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.26513870922757315,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.18016864061355592,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.26899577641448663,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.18437364101409912,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.29589553270345376,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.18141529560089112,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.28180995392351926,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.18680166006088256,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.29608650413456306,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.18486298322677613,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.28475957723655715,
+      "learning_rate": 2.5e-06,
+      "loss": 0.1850725531578064,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.27856833997611247,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.1791991949081421,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.30516489860119894,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.18483606576919556,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.29804656625996045,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.18195321559906005,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.30740179095263215,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.17964634895324708,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.29672245353605753,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.18308933973312377,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.2837212145176832,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.18688681125640869,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.28451872958084823,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.18532857894897461,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.29734825652467917,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.18262310028076173,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3012944650683003,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.17781586647033693,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.2692920477116042,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.18318163156509398,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.2700619255739624,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.18857367038726808,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.2868516489290536,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.18190672397613525,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.28726300225812107,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.1867521286010742,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.2995145996099388,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.17851842641830445,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.28575212768410063,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.17741835117340088,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.31284763297048707,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.1843113422393799,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.3170666816206652,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.18248820304870605,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.30950907311465886,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.1839754819869995,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.2536972696685391,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.17841637134552002,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.291862692607901,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.18339977264404297,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.2848109477155621,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.17915148735046388,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.3060077712638729,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.1848907709121704,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.27297816434517674,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.18888840675354004,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.2781302448691454,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.1847243309020996,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.31527749144779466,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.17888798713684081,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.2981389294211551,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.18081605434417725,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.29438595992497246,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.17975808382034303,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2777422843592099,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.1812159538269043,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3068388373590525,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.17536230087280275,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.272885194568128,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.18365554809570311,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3023336412584975,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.18368566036224365,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.28988866387653284,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.18175660371780394,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.28123365590903454,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.17483808994293212,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.28187049939921544,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.18124582767486572,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.31643189708694724,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.18188211917877198,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.3071146379480691,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.18458983898162842,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.30923765962914507,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.18291953802108765,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.31506268222239586,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.18112607002258302,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.2991031913192095,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.18132129907608033,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.306957825954438,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.18091821670532227,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.30984784981623864,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.17975277900695802,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.33685631116321924,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.18606040477752686,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.28362188085343176,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.17845985889434815,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.28046286761312267,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.18048195838928222,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.29900090645940436,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.17538446187973022,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.32576508972663926,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.18001599311828614,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.30869890145158635,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.18135268688201905,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.28612747319198,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.18038851022720337,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.32303440375726766,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.18109045028686524,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3093047688685527,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.18254566192626953,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.2882548432858515,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.17732615470886232,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.29445166285798274,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.17425966262817383,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3514589237334647,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.18296418190002442,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.32323021290499837,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.1851881265640259,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.30421571681673176,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.180436372756958,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.31911631321578676,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.17929892539978026,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.30015899620754233,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.18022915124893188,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.31786084969492157,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.17959039211273192,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.31599364626026827,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.17806137800216676,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.3303243768736776,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.1811964988708496,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.3312986961423255,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.17583439350128174,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.30030412592967864,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.17623555660247803,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.3177646626866783,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.18003884553909302,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3012142976429357,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.17543296813964843,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3177168816443014,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.18021781444549562,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.34248252589513506,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.18125417232513427,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.29292480325152365,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.17757056951522826,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.3257764859746147,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.18096057176589966,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.28892062916284306,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.18015010356903077,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.32360358107292697,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.18067935705184937,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.3139428787829718,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.1768512487411499,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.3182311104789415,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.17589566707611085,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.3112954733861784,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.177903413772583,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.31026727362843554,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.17535500526428222,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.2855504901999764,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.1787285327911377,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.33581031525319194,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.181508469581604,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.30084134655605693,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.18244649171829225,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3207759323449182,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.18296375274658203,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3103299858846911,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.18089601993560792,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.3310470653200237,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.1829407334327698,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.31783823046596615,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.17886234521865846,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.3279151171862584,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.17618993520736695,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3006249030100123,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.17779455184936524,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.3164261324675526,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.18002912998199463,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.31374931318878396,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.173567795753479,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.31170459979916293,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.1724323034286499,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.3080863565290302,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.1794981598854065,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.30618951989415283,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.1761255979537964,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3029510706797137,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.17623082399368287,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.336336912959277,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.1757615327835083,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.32859024308656015,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.17455869913101196,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.3174124959768476,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.17312180995941162,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.3247217043719523,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.17421470880508422,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.3290462164412991,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.17708632946014405,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.3024938333869805,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.17333836555480958,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.32678703604131465,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.1801586151123047,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.32985764106850785,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.17783432006835936,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.31242585953952057,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.17357670068740844,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.3220012856306909,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.18131563663482667,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.33350326064348024,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.17757024765014648,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.335919926946263,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.17683808803558348,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.3209912976041497,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.17635661363601685,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3165955269677658,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.17913974523544313,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.31474674596852353,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.17095563411712647,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.305115903859637,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.18293533325195313,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.3164297745100823,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.18138229846954346,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.3526140625065779,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.18163397312164306,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.3224933819196559,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.1705089807510376,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.31764589677400257,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.17652597427368164,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.3414067132784035,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.17487871646881104,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.3138098972679996,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.17592911720275878,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.31560573280288073,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.18197228908538818,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.3188962184685744,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.1777464509010315,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.31575220367713525,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.17706483602523804,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3131837624055497,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.17496002912521363,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.32248583567737266,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.17229046821594238,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.3101253584845484,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.17552309036254882,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.33217431742972764,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.17659810781478882,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.33918124282098266,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.17737939357757568,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.31351790893613213,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.1745692253112793,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.33783778867129854,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.1722058415412903,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.308776655874055,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.17710112333297728,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.3142338038371378,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.178252911567688,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.33048986218580767,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.1782402753829956,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.3110909627270251,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.17829475402832032,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.33726065147122686,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.18406097888946532,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.3326393750086487,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.17060396671295167,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.32948960265922206,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.16899311542510986,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.33211982053439487,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.17517964839935302,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.30613498697830943,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.17458994388580323,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.3027770955918648,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.18178436756134034,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.3292318037983906,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.1790144681930542,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.33300755292787143,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.1808505654335022,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.32631723332989787,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.18275127410888672,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.3082787331662993,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.1783647656440735,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.32341550392390483,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.18126009702682494,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.30931371888762194,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.17681236267089845,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.3419672311636941,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.17989683151245117,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3111951639834393,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.17482796907424927,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3230413672933209,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.17436976432800294,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 1955525886476288.0,
+      "train_loss": 0.1948647116279602,
+      "train_runtime": 28413.61,
+      "train_samples_per_second": 7.039,
+      "train_steps_per_second": 0.11
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1955525886476288.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/InternVL3.5-8B-SFT/training_loss.png b/checkpoints/InternVL3.5-8B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..e43a2ab5d6b749d80ff2c755d1857c48099f1cb7
Binary files /dev/null and b/checkpoints/InternVL3.5-8B-SFT/training_loss.png differ
diff --git a/checkpoints/Qwen3-VL-2B-SFT/all_results.json b/checkpoints/Qwen3-VL-2B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..67b1f8a6883bb1fdd5ebbb5135b6ec0b10925041
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1201860236279808.0,
+    "train_loss": 0.2128027264213562,
+    "train_runtime": 15463.9635,
+    "train_samples_per_second": 12.933,
+    "train_steps_per_second": 0.202
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-2B-SFT/chat_template.jinja b/checkpoints/Qwen3-VL-2B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..124386803f142761528f710e77ae483f5f8c4fc4
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/chat_template.jinja
@@ -0,0 +1,120 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' }}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- for message in messages %}
+    {%- if message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content_item in message.content %}
+                {%- if 'text' in content_item %}
+                    {{- content_item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and message.content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/config.json b/checkpoints/Qwen3-VL-2B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a7893a9be707e411c31545b4230e40ae93f72e12
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/config.json
@@ -0,0 +1,71 @@
+{
+  "architectures": [
+    "Qwen3VLForConditionalGeneration"
+  ],
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_size": 2048,
+  "image_token_id": 151655,
+  "model_type": "qwen3_vl",
+  "pad_token_id": 151643,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 262144,
+    "model_type": "qwen3_vl_text",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "pad_token_id": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_theta": 5000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "vocab_size": 151936
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "deepstack_visual_indexes": [
+      5,
+      11,
+      17
+    ],
+    "depth": 24,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1024,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "model_type": "qwen3_vl",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 2048,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652
+}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/eval_results_job_qwen3vl_2b_qwen3_vl_2b_20260430_002232.json b/checkpoints/Qwen3-VL-2B-SFT/eval_results_job_qwen3vl_2b_qwen3_vl_2b_20260430_002232.json
new file mode 100644
index 0000000000000000000000000000000000000000..faddc1bb4f4555dcee2b9743696623d65fb21f1e
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/eval_results_job_qwen3vl_2b_qwen3_vl_2b_20260430_002232.json
@@ -0,0 +1,56 @@
+{
+  "mae_dx": 0.1790637931034483,
+  "rmse_dx": 0.5420201834898262,
+  "mae_dy": 0.16369482758620688,
+  "rmse_dy": 0.4466289248937244,
+  "mae_dz": 0.017312068965517242,
+  "rmse_dz": 0.11626166978875267,
+  "mae_dpitch": 0.35010344827586204,
+  "rmse_dpitch": 0.7528209935143029,
+  "mae_dyaw": 1.3350413793103448,
+  "rmse_dyaw": 2.917615133797725,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.3408692528735632,
+  "mae_position": 0.12002356321839079,
+  "mae_rotation": 0.5617149425287357,
+  "rmse_overall": 1.263988236295834,
+  "wp1_euc_mae": 0.08209225193702147,
+  "wp1_euc_median": 0.022360679774997918,
+  "wp2_euc_mae": 0.1592417265186995,
+  "wp2_euc_median": 0.058309518948453015,
+  "wp3_euc_mae": 0.2573809864066125,
+  "wp3_euc_median": 0.10049875621120885,
+  "wp4_euc_mae": 0.3827385749455985,
+  "wp4_euc_median": 0.1529705854077837,
+  "wp5_euc_mae": 0.5205297307973544,
+  "wp5_euc_median": 0.2197724920005007,
+  "euclidean_mae": 0.2803966541210573,
+  "ADE": 0.2803966541210573,
+  "FDE": 0.5205297307973544,
+  "ADE_median": 0.11621310610654717,
+  "FDE_median": 0.2197724920005007,
+  "SR@0.5m": 0.8663793103448276,
+  "SR@1.0m": 0.9436206896551724,
+  "SR@2.0m": 0.98,
+  "SR@5.0m": 0.9955172413793103,
+  "TrajSR@1.0m": 0.8732758620689656,
+  "TrajSR@2.0m": 0.9517241379310345,
+  "TrajSR@5.0m": 0.9887931034482759,
+  "RotAcc@1.0deg": 0.608103448275862,
+  "RotAcc@5.0deg": 0.9405172413793104,
+  "RotAcc@10.0deg": 0.9853448275862069,
+  "wp1_rot_mae": 0.7521227362111129,
+  "wp2_rot_mae": 1.0375870328696497,
+  "wp3_rot_mae": 1.394596726377232,
+  "wp4_rot_mae": 1.8037210071006413,
+  "wp5_rot_mae": 2.2606874444077585,
+  "rotation_euc_mae": 1.4497429893932787,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-2B-SFT/generation_config.json b/checkpoints/Qwen3-VL-2B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8548f60df0ba832de7a75caa724e098a7ad0eda5
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/generation_config.json
@@ -0,0 +1,14 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.0,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/model.safetensors b/checkpoints/Qwen3-VL-2B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1842277d70a3df5d46689486999c39e22ebf6e79
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f00f124b4c3943edcd2411fc8bac2af6a9fd0b4b769e7a55996571189440f85
+size 4255140312
diff --git a/checkpoints/Qwen3-VL-2B-SFT/processor_config.json b/checkpoints/Qwen3-VL-2B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..33818c7f9e991ad735fd240209f4fa73e6c28c50
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/processor_config.json
@@ -0,0 +1,60 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Qwen2VLImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "merge_size": 2,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 16777216,
+      "shortest_edge": 65536
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_frames": 768,
+    "merge_size": 2,
+    "min_frames": 4,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 25165824,
+      "shortest_edge": 4096
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Qwen3VLVideoProcessor"
+  }
+}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/tokenizer.json b/checkpoints/Qwen3-VL-2B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/checkpoints/Qwen3-VL-2B-SFT/tokenizer_config.json b/checkpoints/Qwen3-VL-2B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..43900b512f2f27ef1e4be8d571c4d7dba693c654
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/tokenizer_config.json
@@ -0,0 +1,31 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen3VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/train_results.json b/checkpoints/Qwen3-VL-2B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..67b1f8a6883bb1fdd5ebbb5135b6ec0b10925041
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1201860236279808.0,
+    "train_loss": 0.2128027264213562,
+    "train_runtime": 15463.9635,
+    "train_samples_per_second": 12.933,
+    "train_steps_per_second": 0.202
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-2B-SFT/trainer_state.json b/checkpoints/Qwen3-VL-2B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a11a2e661fe97b06c93df186eba1842720553e03
--- /dev/null
+++ b/checkpoints/Qwen3-VL-2B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 16.167704755253098,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.6528051853179931,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 15.890120546753822,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.6462714195251464,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 14.94996510180698,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.6038930416107178,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 7.595956825837255,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.49077792167663575,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 3.026643067758099,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.3725566864013672,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.45050871801394,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.3130798816680908,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7098603642718405,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.29621334075927735,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6027577608327673,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.27455599308013917,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6521596145147045,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.2667043447494507,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.5069890685833461,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.26807360649108886,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.5470393023746721,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.26680865287780764,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5543553869620175,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.25434055328369143,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.5420531484574165,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.25767529010772705,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.645702037816744,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.24863953590393068,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.6143136416629473,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.24553947448730468,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5094817219127052,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.2415369987487793,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6291606522275387,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.24887418746948242,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6248895072998087,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.2414403438568115,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.6640745861299296,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.24553894996643066,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.6136916428260776,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.24655485153198242,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6572881584027297,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.2433255910873413,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.6365580690264084,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.23687341213226318,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6771736107097397,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.23829469680786133,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6990706788858505,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.23471264839172362,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6029376877872676,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.23215394020080565,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.6082124769869354,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.2326298713684082,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.7069824323872274,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.23525137901306153,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.6697633994539672,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.23122966289520264,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5896144959913211,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.2369994878768921,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.6202443536122002,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.23878774642944336,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.654740818437731,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.22523627281188965,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5332231058888761,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.22634780406951904,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.5353007164619794,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.23276047706604003,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.53617134295571,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.22672569751739502,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.5149149938648103,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.22631363868713378,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.5959881018141326,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.22707018852233887,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.6648028246958526,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.22777373790740968,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.6395047869916185,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.22382116317749023,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6449783716947614,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.22268824577331542,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.6709421623745665,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.22317943572998047,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.5466948727484514,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.23101482391357422,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.48989327197226856,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.2234072208404541,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.5417400145938138,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.22212049961090088,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.5576422767413268,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.22228083610534669,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6175584799790863,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.21532373428344725,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.6360712146764758,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.21311187744140625,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.5401953881204377,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.22237277030944824,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.5988873649948656,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.22391064167022706,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.5132670412160366,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.2206397533416748,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5935020011592513,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.2148068904876709,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.5324390349507315,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.21909193992614745,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.6082929578051663,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.2133782386779785,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.6272295187969801,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.2180723190307617,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.5538741111929965,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.22759020328521729,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.5703593568416581,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.21657073497772217,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.5873043850881617,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.21824207305908203,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.7955355117519857,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.2209712028503418,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.5347403539894492,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.2184591293334961,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.5464598874722423,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.2130581855773926,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.5871382794412585,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.21770844459533692,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.5516595084585112,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.21629047393798828,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.5949100146178041,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.22033746242523194,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.5798876425998256,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.21477458477020264,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.563545251458103,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.21490144729614258,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.5256728978801903,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.21858677864074708,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.5062609806869888,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.21592459678649903,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.555318042395406,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.21323423385620116,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.6382467151310525,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.2151188373565674,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.5426280956852546,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.2155141830444336,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.5602841392771764,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.21242978572845458,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.5837827171492797,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.2129213333129883,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.5678516858648391,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.21422340869903564,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.6213695156464779,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.2172607660293579,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.6084105321286742,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.21331138610839845,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5848312022835148,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.21255254745483398,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.5855428740644943,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.21734881401062012,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.5135013968609298,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.21321442127227783,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.5938685951597557,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.21113758087158202,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.6490004462160337,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.20692987442016603,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.5694207965471786,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.20959222316741943,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.5648942925010132,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.21758944988250734,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.6544068998265237,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.21146907806396484,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.6680185905090128,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.21348462104797364,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4956164506371995,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.21317172050476074,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.6491508776235345,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.2108323574066162,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.6859739128419746,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.21068863868713378,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.5876140035889241,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.20738301277160645,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.5388630641864397,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.2113194465637207,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.527263546069221,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.20718231201171874,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.6778383199902553,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.20899975299835205,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6041502046582736,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.20922982692718506,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.5872507915529911,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.21518073081970215,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.5414243473578003,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.20869126319885253,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.4624854855413159,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.20277881622314453,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.5247854876993729,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.20181698799133302,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.5808078368512252,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.20898809432983398,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.5805212694083882,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.21212198734283447,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.5721764020420262,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.21448736190795897,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5598008397585128,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.20758256912231446,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5522723710696453,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.20947573184967042,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.5556932215476815,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.2131945848464966,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.5256326530235461,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.20962438583374024,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.5141067804644184,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.21035046577453614,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.509376911595103,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.20757730007171632,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5632632187185198,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.20691304206848143,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.5786515758035645,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.20582923889160157,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.591108109764431,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.2102893590927124,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.5367428274966828,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.20834057331085204,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.49962583382458753,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.2114588975906372,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.5580828852277292,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.2029412269592285,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.5671943339841842,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.20308828353881836,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.5583868175031171,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.2104210376739502,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.5452939479895703,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.20968456268310548,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6195183591357212,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.20957679748535157,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.6171258889408775,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.1984492540359497,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.6164010362674036,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.20632429122924806,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.5558070727772452,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.20776019096374512,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.5943916453083408,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.20839078426361085,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.5018385923371635,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.2044908285140991,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.48660847876218716,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.20328717231750487,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.5586353975354608,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.20572426319122314,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.5709168788921625,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.21369614601135253,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.5589246090839964,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.2072831630706787,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.5711782327133378,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.20526669025421143,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.5656399244912672,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.20442888736724854,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.5520901024337347,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.2092284679412842,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.553756025103199,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.20277605056762696,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.5430187148138641,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.20491743087768555,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.7620941398223869,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.2055516481399536,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.5265612798122297,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.21130599975585937,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.5353794185025008,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.20504627227783204,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.5979654766960453,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.2000981330871582,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5915664314356317,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.20581989288330077,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.562992516341074,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.20160207748413086,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.7046032478558245,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.2043483018875122,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.5184492477363449,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.20285494327545167,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.5806380074338086,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.2009434223175049,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.5204618736945451,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.20416510105133057,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.5765864502605341,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.20326631069183348,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.5779970501460372,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.20073289871215821,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.6393995362823897,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.20874643325805664,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.5108762095593324,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.19835340976715088,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.6435280387445825,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.20992684364318848,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.5838753206875198,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.20265870094299315,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.6745984788898958,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.20037527084350587,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.5358161645108944,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.20123369693756105,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.5112361606823973,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.20752406120300293,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.6333759965752455,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.19706425666809083,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.6206117536462172,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.20125732421875,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.5541712474486513,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.20749812126159667,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.5835934183180771,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.20475554466247559,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.5792514427898341,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.20504088401794435,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.6358843481166787,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.20325000286102296,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.5059500889981753,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.2031909465789795,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.5388306821924389,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.20504312515258788,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5939936480408617,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.2084404706954956,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.5687025114161597,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.20367155075073243,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5703613797457775,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.19950419664382935,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.50147360976836,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.19737675189971923,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.5962810686359508,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.206866455078125,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.564566320219468,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.2020205020904541,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.5246372929237232,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.20939722061157226,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.5415181940486332,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.19629446268081666,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.5627430222118958,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.20240178108215331,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.5578941065241574,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.2012562036514282,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.5487117054063715,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.20127537250518798,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.571360126804376,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.1964709758758545,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6088527341362128,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.1997244954109192,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.5974545138027041,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.19768773317337035,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.5496714163583045,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.20294923782348634,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.7083231030411815,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.20083847045898437,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.5938882026412365,
+      "learning_rate": 2.5e-06,
+      "loss": 0.20156488418579102,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.5963429209905415,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.19514652490615844,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.6395947365412442,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.20121583938598633,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5998001248295249,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.19726226329803467,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.5593754591530539,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.19497768878936766,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.5860785466160793,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.1996150493621826,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.5963601131944923,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.20348951816558838,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.5745583695919886,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.2016512393951416,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5720738010975994,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.1983588457107544,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.594436652050367,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.19393882751464844,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.5547702774883363,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.19950855970382692,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.49741997333090354,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.20486598014831542,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.6191188389453962,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.1979525566101074,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.613998551941137,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.2030627727508545,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.6115410126221079,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.1951197624206543,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.5666967026000811,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.1926891803741455,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5564168831256036,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.20087857246398927,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.647003695530594,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.1991624116897583,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.6020348842840653,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.20029563903808595,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.593460784828495,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.19462828636169432,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.6796900420369784,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.20009157657623292,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.5803908647953272,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.19473812580108643,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5919196787967083,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.20190510749816895,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6056764566097385,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.2063821792602539,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.5106064574990916,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.20139360427856445,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5816570517079882,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.19568054676055907,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.6100308085295513,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.19661173820495606,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.6256775545767371,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.19576869010925294,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.5979254874380191,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.19802470207214357,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.6445065470953916,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.19182772636413575,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.5238518416749739,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.19971816539764403,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.5902086462380663,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.20058016777038573,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5301315426540266,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.19816763401031495,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.5702221922649561,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.19167234897613525,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.5682142108318351,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.19806729555130004,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.6268750721579749,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.1990320086479187,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.6501758398050216,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.20063567161560059,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5367071332530153,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.2000502109527588,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.6644202151690211,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.19811663627624512,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.6020454013039992,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.19873985052108764,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5672766014696592,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.19663108587265016,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.6668756559032718,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.19677751064300536,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.6146850263092741,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.20252432823181152,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.6023134400750195,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.19564807415008545,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5973758444267007,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.19767165184020996,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.604085220565822,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.19245314598083496,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5971658440027723,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.19593756198883056,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.6712656742168871,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.19848381280899047,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.5303502593262494,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.19739968776702882,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.6329890536946617,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.19885218143463135,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.6175733280769058,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.20003676414489746,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.6297338992517326,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.19420522451400757,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5570261846558745,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.19144604206085206,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7464999016757174,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.20028018951416016,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5813509472785208,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.2022254228591919,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5788680063085246,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.19734264612197877,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.6879904092074834,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.19566457271575927,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.545714278159425,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.19808268547058105,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.6957466724150051,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.19703471660614014,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5722555379171206,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.19638856649398803,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.6657445816108672,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.1994560480117798,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.6118638240003964,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.19247424602508545,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.618262759129052,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.19384448528289794,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5841167908088344,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.19670048952102662,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.6330443090953268,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.191538667678833,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.6922248169402944,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.19869886636734008,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5821907331028691,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.19837281703948975,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5484553164447705,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.19366707801818847,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.6131324978552078,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.19763607978820802,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5665386766642198,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.19747262001037597,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.6702088035794936,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.19848825931549072,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.6808200224599221,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.19454023838043213,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5446840545845119,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.1943270444869995,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.6415010178339859,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.19584910869598388,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.603526871568268,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.19342836141586303,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5817111419419255,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.19574793577194213,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.7792382444355755,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.1987607717514038,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.6291788716222239,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.20009655952453614,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.674170182636883,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.2016763210296631,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5738700746068163,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.19858623743057252,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5748267344102337,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.19959205389022827,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.6464282974919533,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.19621236324310304,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.6390320405050175,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.19293551445007323,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5856228193289068,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.1952167272567749,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.6152721851543074,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.19755464792251587,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.6792777707129383,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.1906466007232666,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5650779115466599,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.1896218776702881,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.6068556104605155,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.1972370147705078,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.6087844635927864,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.19401493072509765,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.594443863161453,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.19397275447845458,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5777613928889838,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.19363962411880492,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.6122408540819826,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.19224631786346436,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5922115547592817,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.19027912616729736,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.6012010067551694,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.1921192765235901,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.6089446682050474,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.19460537433624267,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.6314431181993275,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.19075865745544435,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.6136529603147252,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.1973212718963623,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.6278068265217286,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.19564627408981322,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.6308491327804164,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.19089040756225586,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.6806226474068601,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.19906394481658934,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.6497216896329614,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.19420729875564574,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.5988037888796804,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.1949334740638733,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5825410688543936,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.19373006820678712,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5659393573725252,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.19743962287902833,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.6810045862821603,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.18792747259140014,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5624807528399969,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.20065484046936036,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5300049949985157,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.19963890314102173,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6417643354263414,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.19946534633636476,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6263783317633913,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.1873138427734375,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.6642472444356609,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.1936098575592041,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.6361104958877116,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.192909574508667,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.606401356191172,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.1932598114013672,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.6138805257535019,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.19897468090057374,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.6113791423952993,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.19516528844833375,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.5897316619244026,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.1947079300880432,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.6570448249633108,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.19286593198776245,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.6143543897264965,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.18985612392425538,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.6208574768565508,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.1930636167526245,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.6380337536968056,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.1940324306488037,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.6333119199427104,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.195207679271698,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5601684784399228,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.1926344394683838,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.7001254632467586,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.189910888671875,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.5707165379372983,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.1941395878791809,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.637882100753534,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.19586544036865233,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.58305847153215,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.19621498584747316,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.6444124781946634,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.19597216844558715,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.68778482150424,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.20229551792144776,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.589065287919965,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.1879359722137451,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7200708770444023,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.18806444406509398,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.6369212243333968,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.19293060302734374,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.6150129328436796,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.19189660549163817,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5966036549992078,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.1992994427680969,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.6785615385564472,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.19624128341674804,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.626236262460755,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.19942662715911866,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.71228117768398,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.2007957935333252,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5740813965788273,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.19730459451675414,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.6522249776214731,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.1995969295501709,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.6484316026206892,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.19556543827056885,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.6355663767406068,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.19732578992843627,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.6169267731488666,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.1917206883430481,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5994063111681457,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.1916499137878418,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 1201860236279808.0,
+      "train_loss": 0.2128027264213562,
+      "train_runtime": 15463.9635,
+      "train_samples_per_second": 12.933,
+      "train_steps_per_second": 0.202
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1201860236279808.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Qwen3-VL-2B-SFT/training_loss.png b/checkpoints/Qwen3-VL-2B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..5ebacc4733ee7539225383b1c67be9281918b2c0
Binary files /dev/null and b/checkpoints/Qwen3-VL-2B-SFT/training_loss.png differ
diff --git a/checkpoints/Qwen3-VL-8B-SFT/all_results.json b/checkpoints/Qwen3-VL-8B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..36004053df95edae2ea9e2bc0be78e9ca95f27de
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1865119859998720.0,
+    "train_loss": 0.1913537948036194,
+    "train_runtime": 24530.7619,
+    "train_samples_per_second": 8.153,
+    "train_steps_per_second": 0.127
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-8B-SFT/chat_template.jinja b/checkpoints/Qwen3-VL-8B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..124386803f142761528f710e77ae483f5f8c4fc4
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/chat_template.jinja
@@ -0,0 +1,120 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' }}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- for message in messages %}
+    {%- if message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content_item in message.content %}
+                {%- if 'text' in content_item %}
+                    {{- content_item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and message.content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/config.json b/checkpoints/Qwen3-VL-8B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..28bec3525bc2ba9222997a888a237f512de67b28
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/config.json
@@ -0,0 +1,70 @@
+{
+  "architectures": [
+    "Qwen3VLForConditionalGeneration"
+  ],
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_size": 4096,
+  "image_token_id": 151655,
+  "model_type": "qwen3_vl",
+  "pad_token_id": 151643,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 12288,
+    "max_position_embeddings": 262144,
+    "model_type": "qwen3_vl_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "pad_token_id": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_theta": 5000000,
+      "rope_type": "default"
+    },
+    "use_cache": false,
+    "vocab_size": 151936
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "deepstack_visual_indexes": [
+      8,
+      16,
+      24
+    ],
+    "depth": 27,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "model_type": "qwen3_vl",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 4096,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652
+}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/eval_results_job_qwen3vl_8b_qwen3_vl_8b_20260430_002312.json b/checkpoints/Qwen3-VL-8B-SFT/eval_results_job_qwen3vl_8b_qwen3_vl_8b_20260430_002312.json
new file mode 100644
index 0000000000000000000000000000000000000000..f82fb7e68ac9b3f6cd0735b9cc0b77977dfe438c
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/eval_results_job_qwen3vl_8b_qwen3_vl_8b_20260430_002312.json
@@ -0,0 +1,56 @@
+{
+  "mae_dx": 0.16784310344827585,
+  "rmse_dx": 0.5566416075989513,
+  "mae_dy": 0.1432689655172414,
+  "rmse_dy": 0.4053119700964286,
+  "mae_dz": 0.015856896551724137,
+  "rmse_dz": 0.1271192082395429,
+  "mae_dpitch": 0.8785189655172414,
+  "rmse_dpitch": 1.8406877509647648,
+  "mae_dyaw": 1.961193103448276,
+  "rmse_dyaw": 3.8162825611066715,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.5277801724137932,
+  "mae_position": 0.10898965517241378,
+  "mae_rotation": 0.9465706896551725,
+  "rmse_overall": 1.7532080570711825,
+  "wp1_euc_mae": 0.0754244806175472,
+  "wp1_euc_median": 0.020000000000000004,
+  "wp2_euc_mae": 0.1490315903914259,
+  "wp2_euc_median": 0.04242640687119287,
+  "wp3_euc_mae": 0.23570842038325326,
+  "wp3_euc_median": 0.0781024967590666,
+  "wp4_euc_mae": 0.3474207170406001,
+  "wp4_euc_median": 0.120415945787923,
+  "wp5_euc_mae": 0.4714380007035882,
+  "wp5_euc_median": 0.17464249196572973,
+  "euclidean_mae": 0.25580464182728296,
+  "ADE": 0.25580464182728296,
+  "FDE": 0.4714380007035882,
+  "ADE_median": 0.09043517886004399,
+  "FDE_median": 0.17464249196572973,
+  "SR@0.5m": 0.8829310344827587,
+  "SR@1.0m": 0.9489655172413793,
+  "SR@2.0m": 0.9808620689655172,
+  "SR@5.0m": 0.9956896551724138,
+  "TrajSR@1.0m": 0.8844827586206897,
+  "TrajSR@2.0m": 0.9586206896551724,
+  "TrajSR@5.0m": 0.9887931034482759,
+  "RotAcc@1.0deg": 0.5544827586206896,
+  "RotAcc@5.0deg": 0.8108620689655173,
+  "RotAcc@10.0deg": 0.9686206896551725,
+  "wp1_rot_mae": 1.72129239448981,
+  "wp2_rot_mae": 1.9778662255644632,
+  "wp3_rot_mae": 2.262463502586465,
+  "wp4_rot_mae": 2.622403191819014,
+  "wp5_rot_mae": 2.985252349448265,
+  "rotation_euc_mae": 2.313855532781603,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-8B-SFT/generation_config.json b/checkpoints/Qwen3-VL-8B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8548f60df0ba832de7a75caa724e098a7ad0eda5
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/generation_config.json
@@ -0,0 +1,14 @@
+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.0,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.5.3"
+}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/model.safetensors b/checkpoints/Qwen3-VL-8B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f1f71ab3427aa2a0f1ba0681efddb0d4e6046948
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9248be9de786b9ff6b663e3b127a88a56c4454a86424774c958f2f667ad3139e
+size 17534340584
diff --git a/checkpoints/Qwen3-VL-8B-SFT/processor_config.json b/checkpoints/Qwen3-VL-8B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..33818c7f9e991ad735fd240209f4fa73e6c28c50
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/processor_config.json
@@ -0,0 +1,60 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Qwen2VLImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "merge_size": 2,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 16777216,
+      "shortest_edge": 65536
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_frames": 768,
+    "merge_size": 2,
+    "min_frames": 4,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 25165824,
+      "shortest_edge": 4096
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Qwen3VLVideoProcessor"
+  }
+}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/tokenizer.json b/checkpoints/Qwen3-VL-8B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
diff --git a/checkpoints/Qwen3-VL-8B-SFT/tokenizer_config.json b/checkpoints/Qwen3-VL-8B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..43900b512f2f27ef1e4be8d571c4d7dba693c654
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/tokenizer_config.json
@@ -0,0 +1,31 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen3VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/train_results.json b/checkpoints/Qwen3-VL-8B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..36004053df95edae2ea9e2bc0be78e9ca95f27de
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1865119859998720.0,
+    "train_loss": 0.1913537948036194,
+    "train_runtime": 24530.7619,
+    "train_samples_per_second": 8.153,
+    "train_steps_per_second": 0.127
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3-VL-8B-SFT/trainer_state.json b/checkpoints/Qwen3-VL-8B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0badad533214679590ce4ee4ca6885daf4ea671
--- /dev/null
+++ b/checkpoints/Qwen3-VL-8B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 11.83144098371552,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.6488185882568359,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 9.59315923624666,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.6357447624206543,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 8.611135612577282,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.5702734470367432,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 3.380574205681449,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.4336705207824707,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.3215372913591454,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.3196877956390381,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.49205152753117876,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.26679015159606934,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.45222798092529587,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.267962121963501,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.38880908025022265,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.2507643222808838,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.3466305556727367,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.24545693397521973,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.3545745065230796,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.24865365028381348,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.36611277528700825,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.24806594848632812,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.4168127418004872,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.23844523429870607,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.3478506619909293,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.24028465747833253,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.37897531613175517,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.23141345977783204,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.44434967369549566,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.227997350692749,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.3846551259722682,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.22319293022155762,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.42029854714577086,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.23121466636657714,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.5531278369713543,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.22436304092407228,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.41335994292883266,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.2279865026473999,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.41777451150300726,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.22936224937438965,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.4429107133096889,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.22612979412078857,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.37955292136954716,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.21961536407470703,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.3916244375225685,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.22160472869873046,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.4190841730835042,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.21761112213134765,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.39560583196094196,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.21524910926818847,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4078924637485393,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.21542978286743164,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.4106351573034691,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.2175312042236328,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.350392701468648,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.21332988739013672,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.33990023576201683,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.21945428848266602,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.3930734349391618,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.21993803977966309,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.3988914165504776,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.20795869827270508,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.35731910595219907,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.20925359725952147,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.3059149411920343,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.2151254415512085,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.38132024364024864,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.20901703834533691,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.3345902579307589,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.20827054977416992,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.3654908632001385,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.21013379096984863,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.39442024987420615,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.20931053161621094,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.4182616523944993,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.20571417808532716,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.341180560016227,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.2043307065963745,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.3752267388801998,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.20494937896728516,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.3245176299142898,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.21275849342346193,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.30464125237598766,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.20537421703338624,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.32422803949501344,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.20476508140563965,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.3539302093186753,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.2044001340866089,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.3405614059989652,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.198509681224823,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.38543797495647464,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.19933369159698486,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.3243047107311659,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.20511775016784667,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.3856847456921783,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.2059837818145752,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.39057481445802306,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.20367493629455566,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.3868568198929062,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.19742422103881835,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.3509586614208196,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.20141959190368652,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.39255560366454684,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.19597206115722657,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.3753941656538999,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.19957098960876465,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.33505547223235554,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.2104717254638672,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.3397525078929753,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.19930799007415773,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.41307748920624604,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.20131049156188965,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.46727990153877497,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.20283927917480468,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.3544063329370901,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.200484037399292,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.3342685909168471,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.19547154903411865,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.37096928349741987,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.20015311241149902,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.3027007752332859,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.19957488775253296,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.33671892410389453,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.20140202045440675,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3380893311921056,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.1972325325012207,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.33781332412651466,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.19717284440994262,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.34031999571424276,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.20164141654968262,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.3761741276870676,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.19852631092071532,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.33766360947275226,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.1955265998840332,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.3263649532195094,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.1965425729751587,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.32263376893110113,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.19828790426254272,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.3517548043501075,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.19447412490844726,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.338792238112512,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.1949925899505615,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.37194857535123965,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.19610352516174318,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.3915181276276985,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.1986656069755554,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.38957297489256615,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.19531142711639404,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.35280725988589035,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.1945881724357605,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.328859584018874,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.1986994504928589,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.42079847846495083,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.1951799511909485,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.3564546225835861,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.19262158870697021,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.30910181888519894,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.1888942837715149,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.3800555896836864,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.191619348526001,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.3214924880946795,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.1980231523513794,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.3380732634719034,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.19314407110214232,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.389068899000227,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.19506924152374266,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.3863125802433344,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.19590425491333008,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.3386867491077868,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.19343934059143067,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.32498475663304227,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.19271985292434693,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.37076633469721365,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.19003691673278808,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.34537647808173055,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.19376157522201537,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.3396204448169827,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.1885458469390869,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.3278931050248095,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.1910359740257263,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.31081499529851564,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.1909637451171875,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.36721402125547814,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.19662458896636964,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.3535696512302074,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.18980509042739868,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.38926250103548876,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.18494791984558107,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.34453334571881833,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.184212589263916,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.3762340897100377,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.18981791734695436,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.3530324206457789,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.19288237094879152,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.34771997443076413,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.19517345428466798,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.3259036212205655,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.18931124210357667,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3860199237155075,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.19042216539382933,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.376632222430791,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.19394458532333375,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.3295954306779813,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.1905941605567932,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.36350379944390687,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.1911671996116638,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.3515414089838784,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.1879589319229126,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.36056971430032736,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.1876499891281128,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.38550285088851427,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.18704679012298583,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.3907983397501218,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.1898512363433838,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.3185324338246927,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.18859152793884276,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.33476796180481766,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.19204466342926024,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.36080516187755207,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.18328707218170165,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.3643905053944042,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.18360459804534912,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.3451314428178685,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.19115616083145143,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.3503646826533139,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.1892825484275818,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.38647304833372,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.18974583148956298,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.3923573548206856,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.17893118858337403,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.3381639516581911,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.18569667339324952,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.34840921452583046,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.18692519664764404,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.3621025779320011,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.18706123828887938,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.3409994515842346,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.1850137233734131,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.32134108140002676,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.18305637836456298,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.34976165687661737,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.18524043560028075,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.35397661097444644,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.19263406991958618,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.3047568820190958,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.18630678653717042,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.3509346490656729,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.18509007692337037,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.35442414080318474,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.183595871925354,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.3763561451452722,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.18734192848205566,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.3806442700695017,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.18231661319732667,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.3693294758862003,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.18366684913635253,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.4679876778528945,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.1844363331794739,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.3188887295686398,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.189242947101593,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.38141292954197614,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.18387432098388673,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.42807961078436774,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.1800466775894165,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.407419776715016,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.1842915654182434,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.3457735305030836,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.18095966577529907,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.40768516482856293,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.1833919644355774,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.3218920190656802,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.18204569816589355,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.3296264433157612,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.1804790735244751,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.3252281867414127,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.18308441638946532,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.3453938909284546,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.1829667568206787,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.3705007173736256,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.17947460412979127,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.3546369769618238,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.18765747547149658,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.3781284245896334,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.17857763767242432,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.3588497462943525,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.18716816902160643,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4018300783669189,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.1817032814025879,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.43569079636878416,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.17936263084411622,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.4112327697697191,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.17932071685791015,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.32882512900562777,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.1854783296585083,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.34718382356494926,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.17551472187042236,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.40014884763121933,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.17985255718231202,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.37002552374237196,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.1858470320701599,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.3189241194339908,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.1844353437423706,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.3292240464587196,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.183160138130188,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.3489032040504687,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.18127987384796143,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.33504467669019317,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.1818302035331726,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.3278367328955766,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.18281204700469972,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.3932354078093087,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.18620465993881224,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.3669969573194585,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.18195055723190307,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.3798793826233043,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.17828736305236817,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.3169054303302339,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.1758497476577759,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.3427628658073453,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.18466551303863527,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.3313402627200071,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.17985165119171143,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.3299040944947936,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.186691951751709,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.34753331909668905,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.1754052758216858,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.3813115280828973,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.18044731616973878,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.34093546280867537,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.18008266687393187,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.37253670534687966,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.17874090671539306,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.3605836169669652,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.17416150569915773,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.3528045717062405,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.17853121757507323,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.35663168003844714,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.1759209394454956,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.35462371645127194,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.1807964563369751,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.40893120307942266,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.1787680983543396,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.35660931340798585,
+      "learning_rate": 2.5e-06,
+      "loss": 0.17910590171813964,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.36395042718977416,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.17353482246398927,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.38459148561610207,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.17830569744110109,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.3483704729147774,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.17659709453582764,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.37643842809936706,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.17362364530563354,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.3616237003343465,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.17707206010818483,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.36273369687906065,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.1810509443283081,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.3538045704417264,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.1798029899597168,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.3665300341057018,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.17657459974288942,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.3888504913735948,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.1715422749519348,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.3299991726946758,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.17716653347015382,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.33385446323519324,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.18214735984802247,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.38150050429665633,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.17498456239700316,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.3772340757946472,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.1807033061981201,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.3995250415291945,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.17290885448455812,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.3722805742071356,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.17120599746704102,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.4127408400570583,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.17853354215621947,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.38023468325708004,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.17626445293426513,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.39569320233551497,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.17760238647460938,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.3395104645209661,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.17229535579681396,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.40844052109667645,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.17733073234558105,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.37810430448153487,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.17335065603256225,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.40814146676733887,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.17898330688476563,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.3602487349877917,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.18250794410705568,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.3393538607720307,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.1782406210899353,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.3915429710722421,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.1735108494758606,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.37985437272954925,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.1750550627708435,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.38539358407326346,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.1733097553253174,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.3738259217786014,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.17515769004821777,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.3819943763249579,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.16884570121765136,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.389801745248532,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.17741096019744873,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.3653120667345927,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.17773522138595582,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.37933212087707896,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.17607669830322265,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.3321821531044163,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.16934506893157958,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4060257595183473,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.17470494508743287,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.4081327316011303,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.1757789969444275,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.38926392491151846,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.17842782735824586,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.3688818898128435,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.17657792568206787,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.3996797165946016,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.17507438659667968,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.3660129183813682,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.17568498849868774,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.40919180930559174,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.17516255378723145,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.3883216412697579,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.17390661239624022,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.36101851282241604,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.1793433666229248,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.351474002817543,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.17248008251190186,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.3584227522515397,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.17412493228912354,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.3698435441682052,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.16935389041900634,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.3875871103452335,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.17429906129837036,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.38818689314212534,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.175516939163208,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.35684099766648697,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.1735387325286865,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.42314786020598566,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.1752473831176758,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.3896139405923189,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.1759577751159668,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.4351615457717856,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.17103490829467774,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.3834559121154836,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.16780270338058473,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.39841337538803123,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.1768646717071533,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.37995888893004054,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.17878193855285646,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.3883854717990393,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.17407444715499878,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.41133967353441564,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.17313239574432374,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.42036949314037747,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.17422957420349122,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.4147236855875292,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.17318263053894042,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.43489255396692184,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.17206907272338867,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.4244354677425118,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.17518973350524902,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.40784179462155445,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.1695675015449524,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.3987641796361722,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.17051217555999756,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.4187928539614081,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.17391297817230225,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.3690037869142762,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.16910953521728517,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.3825088904788983,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.17401565313339235,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.42264696630595905,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.17544336318969728,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.37637912026899967,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.17184804677963256,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.41579932435136646,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.17463115453720093,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.3686323403281178,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.17356055974960327,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.38576880068059977,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.17460007667541505,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.42703129906735826,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.17067699432373046,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.34858414304666985,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.17033281326293945,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.4152200143790169,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.17171205282211305,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.38517462156418525,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.16951452493667601,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.3434889573405646,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.17201969623565674,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.44971621368685466,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.1749892234802246,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.36196201020004853,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.17593114376068114,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.43060409309666475,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.17676409482955932,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.3991002538591168,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.17424919605255126,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.40310381220451585,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.17589831352233887,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.39399011188949445,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.17281873226165773,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.4083878901260424,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.16947317123413086,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.3851361432363648,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.1711735486984253,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.39337090058301416,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.17315969467163086,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.40037871082344,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.16745550632476808,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.37587338743617393,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.16563820838928223,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.4123037543150221,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.17215006351470946,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.38068425945027606,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.1699705719947815,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.3857308249335943,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.16985273361206055,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.3771439406828394,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.16914334297180175,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.42087726200191267,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.1676808476448059,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.4027649556029035,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.16704294681549073,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.41244639471418854,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.16766990423202516,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.4188553008501556,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.17007026672363282,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.37308514287025385,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.16677470207214357,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.3866325428538323,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.17320499420166016,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.44679886990626905,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.17161571979522705,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.39358272297855784,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.16702463626861572,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.40122291418570893,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.17485493421554565,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.41570716353907156,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.17134263515472412,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.4519701719039968,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.17035775184631347,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.4118651442289031,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.1699577808380127,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.3941928006067303,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.17237366437911988,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.40832217608875215,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.16440961360931397,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.3971180240809628,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.17652008533477784,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.407761577951783,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.174829363822937,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.43193458090135123,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.1740306258201599,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.42658943363969254,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.16446399688720703,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.3958915075423234,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.170265531539917,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.42859321093856184,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.16803120374679564,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.38305141869358017,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.16951898336410523,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.3798038400686433,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.17529479265213013,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.4110364408684957,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.1713550090789795,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.42376391140315484,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.17042005062103271,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.3797604889728542,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.1684396505355835,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.3914880075566837,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.16648833751678466,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.4028554721018558,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.16861410140991212,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.4114822506067345,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.1701727867126465,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.4152545647271041,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.17078380584716796,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.38756806680774125,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.16773648262023927,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.42784086073295813,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.16553585529327391,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.4042746869722402,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.17082666158676146,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.4061618058805029,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.17140612602233887,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.391081181391559,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.17159396409988403,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.41735466488493406,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.17192131280899048,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.4185419841597216,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.17720068693161012,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.39722761835716225,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.16387460231781006,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.3849231647626283,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.16302640438079835,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.41574175274160263,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.1683335542678833,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.39544014121768384,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.16783595085144043,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.37372955897537213,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.17508283853530884,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.40018108643956146,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.17274413108825684,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.41874909868220905,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.17360517978668213,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.43201391828339053,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.17629475593566896,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.35261808222980445,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.1717047333717346,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.4309234115032205,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.17459846735000611,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.4212724049580533,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.1706598997116089,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.45412833541759956,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.17277355194091798,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.3904315097822292,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.1682264804840088,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.3934819932959913,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.1676420211791992,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 1865119859998720.0,
+      "train_loss": 0.1913537948036194,
+      "train_runtime": 24530.7619,
+      "train_samples_per_second": 8.153,
+      "train_steps_per_second": 0.127
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1865119859998720.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Qwen3-VL-8B-SFT/training_loss.png b/checkpoints/Qwen3-VL-8B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..5328d4592bcf555506300beff5f493f6fad204ea
Binary files /dev/null and b/checkpoints/Qwen3-VL-8B-SFT/training_loss.png differ
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/all_results.json b/checkpoints/Qwen3.5-0.8B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..233d46152f5903545009fef74c65a4173ff6ca1d
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1.4635172064982467e+18,
+    "train_loss": 0.20465421264648437,
+    "train_runtime": 15978.2192,
+    "train_samples_per_second": 12.517,
+    "train_steps_per_second": 0.196
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/chat_template.jinja b/checkpoints/Qwen3.5-0.8B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/chat_template.jinja
@@ -0,0 +1,154 @@
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is true %}
+        {{- '<think>\n' }}
+    {%- else %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/config.json b/checkpoints/Qwen3.5-0.8B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4286f4491ccaf2b7d25683a7bf4b7456a3857dea
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/config.json
@@ -0,0 +1,105 @@
+{
+  "architectures": [
+    "Qwen3_5ForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "eos_token_id": 248046,
+  "hidden_size": 1024,
+  "image_token_id": 248056,
+  "model_type": "qwen3_5",
+  "pad_token_id": 248044,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_output_gate": true,
+    "bos_token_id": null,
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "full_attention_interval": 4,
+    "head_dim": 256,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 3584,
+    "layer_types": [
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention"
+    ],
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 16,
+    "linear_value_head_dim": 128,
+    "mamba_ssm_dtype": "float32",
+    "max_position_embeddings": 262144,
+    "mlp_only_layers": [],
+    "model_type": "qwen3_5_text",
+    "mtp_num_hidden_layers": 1,
+    "mtp_use_dedicated_embeddings": false,
+    "num_attention_heads": 8,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "pad_token_id": null,
+    "partial_rotary_factor": 0.25,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        11,
+        11,
+        10
+      ],
+      "partial_rotary_factor": 0.25,
+      "rope_theta": 10000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "vocab_size": 248320
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 248057,
+  "vision_config": {
+    "deepstack_visual_indexes": [],
+    "depth": 12,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "model_type": "qwen3_5",
+    "num_heads": 12,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 1024,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 248054,
+  "vision_start_token_id": 248053
+}
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/eval_results_job_qwen35_small_qwen35_08b_20260430_083113.json b/checkpoints/Qwen3.5-0.8B-SFT/eval_results_job_qwen35_small_qwen35_08b_20260430_083113.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2e16d7cf9f4baaf2fe2900776783e55b45bdb52
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/eval_results_job_qwen35_small_qwen35_08b_20260430_083113.json
@@ -0,0 +1,56 @@
+{
+  "mae_dx": 0.1739534482758621,
+  "rmse_dx": 0.5504707859704242,
+  "mae_dy": 0.14513103448275863,
+  "rmse_dy": 0.4020164261526533,
+  "mae_dz": 0.01417241379310345,
+  "rmse_dz": 0.08441768077592636,
+  "mae_dpitch": 0.3197724137931034,
+  "rmse_dpitch": 0.6987947505872348,
+  "mae_dyaw": 1.2795741379310344,
+  "rmse_dyaw": 2.728678210390897,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.32210057471264364,
+  "mae_position": 0.11108563218390805,
+  "mae_rotation": 0.5331155172413793,
+  "rmse_overall": 1.1836215975584834,
+  "wp1_euc_mae": 0.0785504786394066,
+  "wp1_euc_median": 0.022360679774997894,
+  "wp2_euc_mae": 0.15462871597591013,
+  "wp2_euc_median": 0.050990195135927854,
+  "wp3_euc_mae": 0.2431200521888108,
+  "wp3_euc_median": 0.09219544457292884,
+  "wp4_euc_mae": 0.35202525203074125,
+  "wp4_euc_median": 0.13379035964562547,
+  "wp5_euc_mae": 0.47539165669032896,
+  "wp5_euc_median": 0.18828128246084586,
+  "euclidean_mae": 0.26074323110503955,
+  "ADE": 0.2607432311050396,
+  "FDE": 0.47539165669032896,
+  "ADE_median": 0.100752198494543,
+  "FDE_median": 0.18828128246084586,
+  "SR@0.5m": 0.88,
+  "SR@1.0m": 0.9484482758620689,
+  "SR@2.0m": 0.9817241379310345,
+  "SR@5.0m": 0.9962068965517241,
+  "TrajSR@1.0m": 0.8870689655172413,
+  "TrajSR@2.0m": 0.9586206896551724,
+  "TrajSR@5.0m": 0.9905172413793103,
+  "RotAcc@1.0deg": 0.6370689655172413,
+  "RotAcc@5.0deg": 0.9410344827586207,
+  "RotAcc@10.0deg": 0.9863793103448276,
+  "wp1_rot_mae": 0.7137537474185929,
+  "wp2_rot_mae": 0.9943262207547675,
+  "wp3_rot_mae": 1.331487042838617,
+  "wp4_rot_mae": 1.7135381878045994,
+  "wp5_rot_mae": 2.1344669020838754,
+  "rotation_euc_mae": 1.3775144201800904,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/generation_config.json b/checkpoints/Qwen3.5-0.8B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3eda838f46234dcfea054e23287d1718e943e6aa
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/generation_config.json
@@ -0,0 +1,10 @@
+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    248046,
+    248044
+  ],
+  "pad_token_id": 248044,
+  "transformers_version": "5.5.3",
+  "use_cache": true
+}
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/model.safetensors b/checkpoints/Qwen3.5-0.8B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0119119a85749f090039163c09e855fdfd6e46c3
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6aa08a7a930536740ff1ee903028572824f61230da287369643f39b16794ae7
+size 2214589376
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/processor_config.json b/checkpoints/Qwen3.5-0.8B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..33818c7f9e991ad735fd240209f4fa73e6c28c50
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/processor_config.json
@@ -0,0 +1,60 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Qwen2VLImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "merge_size": 2,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 16777216,
+      "shortest_edge": 65536
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_frames": 768,
+    "merge_size": 2,
+    "min_frames": 4,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 25165824,
+      "shortest_edge": 4096
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Qwen3VLVideoProcessor"
+  }
+}
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json b/checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..67741b04f23bfdb46501f748ce27865ec82eccfb
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
+size 19989343
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/tokenizer_config.json b/checkpoints/Qwen3.5-0.8B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aeb7593d501d68475dd8091372c6d633a7a2c63f
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/tokenizer_config.json
@@ -0,0 +1,33 @@
+{
+  "add_prefix_space": false,
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": true,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "processor_class": "Qwen3VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/train_results.json b/checkpoints/Qwen3.5-0.8B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..233d46152f5903545009fef74c65a4173ff6ca1d
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 1.4635172064982467e+18,
+    "train_loss": 0.20465421264648437,
+    "train_runtime": 15978.2192,
+    "train_samples_per_second": 12.517,
+    "train_steps_per_second": 0.196
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/trainer_state.json b/checkpoints/Qwen3.5-0.8B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b26d89212a698d09f188035d20b02a548824ad63
--- /dev/null
+++ b/checkpoints/Qwen3.5-0.8B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 24.154499053955078,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.43009061813354493,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 13.298259735107422,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.4069969654083252,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 4.6046857833862305,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.3430281639099121,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 2.4529078006744385,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.3079708099365234,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.7886693477630615,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.2815701484680176,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.699796438217163,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.2694547653198242,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.3927942514419556,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.27406697273254393,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 1.410854458808899,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.25742459297180176,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.304625391960144,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.2527207851409912,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.3793327808380127,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.2565553426742554,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.3987141847610474,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.25747051239013674,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.3587315082550049,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.24701259136199952,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.4011516571044922,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.24983885288238525,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.399139165878296,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.2414431095123291,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.3931989669799805,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.239495849609375,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 1.2638698816299438,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.23560638427734376,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 1.3124679327011108,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.2441340923309326,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.3111082315444946,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.2364725112915039,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.3086109161376953,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.24139461517333985,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.344279170036316,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.24218306541442872,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 1.2795510292053223,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.23907582759857177,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 1.0937620401382446,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.2335747480392456,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 1.3082506656646729,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.2359257698059082,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 1.1195415258407593,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.23113086223602294,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0815153121948242,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.22873566150665284,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 1.2189197540283203,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.2301633596420288,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 1.0007416009902954,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.23275952339172362,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 1.0670294761657715,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.22844791412353516,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 1.0621100664138794,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.23461694717407228,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 1.184499740600586,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.23582732677459717,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 1.1273094415664673,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.22369306087493895,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.982417106628418,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.2245041847229004,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 1.0180294513702393,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.2303633213043213,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 1.0338643789291382,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.22460854053497314,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.8967295289039612,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.224709415435791,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.074678897857666,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.22632455825805664,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.9827248454093933,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.22602491378784179,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.954534649848938,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.2213623046875,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.8947958946228027,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.22030019760131836,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.994922399520874,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.22146718502044677,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.960228443145752,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.2290752649307251,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.8288887739181519,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.22198934555053712,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.9185967445373535,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.22025692462921143,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.9180241227149963,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.22060942649841309,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.9974319338798523,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.21446082592010499,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 1.0199272632598877,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.21097931861877442,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.0272812843322754,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.22108638286590576,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.969963014125824,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.22175126075744628,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.981428325176239,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.21852502822875977,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.0144567489624023,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.2126768112182617,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.8966764807701111,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.21713790893554688,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 1.0284103155136108,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.21167585849761963,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.9075572490692139,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.21518006324768066,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.8963767886161804,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.22648308277130128,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.8435696363449097,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.2141176223754883,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.864231526851654,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.21634316444396973,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 1.0810909271240234,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.2190020799636841,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.9246620535850525,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.21658658981323242,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.9349477291107178,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.21075100898742677,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.0043901205062866,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.21601030826568604,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.8367205858230591,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.2150674819946289,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.9035486578941345,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.21784515380859376,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.8648081421852112,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.21317291259765625,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.9622132182121277,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.21332902908325196,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.8541603684425354,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.21687707901000977,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.9410703182220459,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.21425023078918456,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.8768807649612427,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.21114716529846192,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.9346839785575867,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.21280355453491212,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.773104727268219,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.21328551769256593,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.8742129802703857,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.2097193717956543,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.851413905620575,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.21059794425964357,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.9423609972000122,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.2117161273956299,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 1.0031023025512695,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.21474902629852294,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.9702557325363159,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.2112131357192993,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.8935959935188293,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.2096531867980957,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.8280373215675354,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.2149428367614746,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.8387318849563599,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.21069800853729248,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.9887408018112183,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.20976176261901855,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.9188724756240845,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.20372920036315917,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.8613991737365723,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.20750484466552735,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.8634099960327148,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.21537406444549562,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 1.0032637119293213,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.20938510894775392,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 1.013604998588562,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.21059298515319824,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.884544312953949,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.2108741283416748,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.8698775768280029,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.2090519905090332,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.883059024810791,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.20870964527130126,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.8841277360916138,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.20556044578552246,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.9181431531906128,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.20940375328063965,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.8961875438690186,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.20449237823486327,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.9166404604911804,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.20691614151000975,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.8640049695968628,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.20648136138916015,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.8916935324668884,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.2128384828567505,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.9438337087631226,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.20601146221160888,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.8253967761993408,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.20144739151000976,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.8320073485374451,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.1991488218307495,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.9561188817024231,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.20644969940185548,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.803125262260437,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.20915827751159669,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.8132813572883606,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.21236319541931153,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.9199708700180054,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.2055152416229248,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8928452134132385,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.206940221786499,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.9517531991004944,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.2108684539794922,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.9454931020736694,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.20762853622436522,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.8216907978057861,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.2080090045928955,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.7746848464012146,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.2045358896255493,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.9690631031990051,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.2042619466781616,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.8224774599075317,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.20257024765014647,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 1.0545451641082764,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.20705747604370117,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.7687672972679138,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.20620079040527345,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.8217872977256775,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.2085184097290039,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.8416702747344971,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.2004465103149414,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.9369352459907532,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.2000356674194336,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.8932799100875854,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.2081597328186035,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.8241865634918213,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.20646026134490966,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.895818829536438,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.20636439323425293,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 1.025847315788269,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.19549331665039063,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.8418158292770386,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.2030022621154785,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.8930888175964355,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.2045799732208252,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.9579597115516663,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.2056349515914917,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.8847565054893494,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.2011105537414551,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.8585783839225769,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.199721622467041,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.8783968091011047,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.20214991569519042,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.814146101474762,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.21019458770751953,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.808813214302063,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.20353050231933595,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.86122727394104,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.20200772285461427,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.8682144284248352,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.2004477024078369,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.9007877707481384,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.2053370952606201,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.8896200060844421,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.19951150417327881,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.951781690120697,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.20167291164398193,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.9838011860847473,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.20190558433532715,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.7393888235092163,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.20698275566101074,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.9344687461853027,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.2017432689666748,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.8322877287864685,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.19672255516052245,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.8978738784790039,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.20154342651367188,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.9294552206993103,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.19790271520614625,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.9712645411491394,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.2009965419769287,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.7751940488815308,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.19958891868591308,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.7986553311347961,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.19687262773513795,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.7831050753593445,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.20044875144958496,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.8416183590888977,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.1997106909751892,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.9130992293357849,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.19710896015167237,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.8053887486457825,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.20499968528747559,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.8130092024803162,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.1947783946990967,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.8554696440696716,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.20600056648254395,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.8399017453193665,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.19943264722824097,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.9924997687339783,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.19647090435028075,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.8956066966056824,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.19706339836120607,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.773949384689331,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.20296840667724608,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.9293017983436584,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.19287054538726806,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 1.0113096237182617,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.19713553190231323,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8387943506240845,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.2030486583709717,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.8359475135803223,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.20096054077148437,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.8501754999160767,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.2003248691558838,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.8864139318466187,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.19884690046310424,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.8472388982772827,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.19895727634429933,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.835716187953949,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.20096054077148437,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 1.056063175201416,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.20421340465545654,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.8620187044143677,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.1987598180770874,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.9530871510505676,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.19546263217926024,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.9080846309661865,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.1925678253173828,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.8608678579330444,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.2030557632446289,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.8551720380783081,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.1977646231651306,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.8102371692657471,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.20460376739501954,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.757784366607666,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.1916171431541443,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.8284843564033508,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.19816431999206544,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.8836312890052795,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.19672504663467408,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.853327214717865,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.1962347984313965,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.7774537205696106,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.19192854166030884,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.9308248162269592,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.194757080078125,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.8841214776039124,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.19211833477020263,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.8462106585502625,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.19800645112991333,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.954951286315918,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.19544832706451415,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.9142067432403564,
+      "learning_rate": 2.5e-06,
+      "loss": 0.19691340923309325,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.8898570537567139,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.1904543399810791,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.8316683769226074,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.19625117778778076,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8947731852531433,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.1924436092376709,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.8805821537971497,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.19019029140472413,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.9083222150802612,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.19431073665618898,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.9499437212944031,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.19770257472991942,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.8856005668640137,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.1973590612411499,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.930976152420044,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.1932486414909363,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.8594582676887512,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.18842577934265137,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.8443211913108826,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.19471209049224852,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.8561191558837891,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.1995155930519104,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.8672814965248108,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.19246792793273926,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.9349257349967957,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.1974799871444702,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.9783422946929932,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.18990163803100585,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.9529528617858887,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.18725800514221191,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.9632482528686523,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.19491477012634278,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.9552566409111023,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.19355373382568358,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.8642057776451111,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.19464772939682007,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.9171792268753052,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.18813092708587648,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.9625651836395264,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.19436421394348144,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.8342574238777161,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.18922022581100464,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.9471722841262817,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.19615614414215088,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.9361758232116699,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.20051980018615723,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.8602835536003113,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.19571347236633302,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.9917039275169373,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.19011812210083007,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.7912291884422302,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.1915016531944275,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.9529620409011841,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.19014396667480468,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8650413751602173,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.19222151041030883,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.9507162570953369,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.1860235333442688,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.8666033744812012,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.194149649143219,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.9662922620773315,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.19424891471862793,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.8611494302749634,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.19236912727355956,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.8275763988494873,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.18528724908828736,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.9188935160636902,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.19188053607940675,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 1.038678526878357,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.1937857151031494,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.9681159853935242,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.1949242353439331,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.8842741250991821,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.1935848593711853,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 1.0433149337768555,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.19175238609313966,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.9612663388252258,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.19264159202575684,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.8770299553871155,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.19034202098846437,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.9762235283851624,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.18992135524749756,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.8780629634857178,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.1962789535522461,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.8781450986862183,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.18934234380722045,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.9071021676063538,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.19153733253479005,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.854719340801239,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.1858367443084717,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.9517054557800293,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.18999232053756715,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.9305564761161804,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.19198548793792725,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.867964506149292,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.19081370830535888,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 1.079858660697937,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.19260694980621337,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 1.0393986701965332,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.192878794670105,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.9691563248634338,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.1872938871383667,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.9591838121414185,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.18445268869400025,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.0650240182876587,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.193602454662323,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.9803401827812195,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.19585211277008058,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.8571217656135559,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.19123618602752684,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.9580956697463989,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.1895804524421692,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.9303188323974609,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.1917721748352051,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.9539825320243835,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.1903479814529419,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.905822217464447,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.1895788311958313,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 1.0421855449676514,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.19260928630828858,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.938606321811676,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.18598917722702027,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.9746700525283813,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.18742547035217286,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.9145529270172119,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.19074430465698242,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.8072162866592407,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.18499974012374878,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.9440052509307861,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.19107134342193605,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.9550809264183044,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.1914163589477539,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.8588680624961853,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.18734397888183593,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.9598610401153564,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.1909665584564209,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.9106743931770325,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.19062764644622804,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.9662719964981079,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.19155207872390748,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 1.0235718488693237,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.18778762817382813,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.8647546768188477,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.1867724895477295,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.9516528844833374,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.18868536949157716,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.9623562097549438,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.18562084436416626,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.8501712679862976,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.18841230869293213,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 1.1048811674118042,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.1919555187225342,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.8838031888008118,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.19270492792129518,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9717556238174438,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.19487048387527467,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.9107718467712402,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.1918075442314148,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.9718947410583496,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.19251750707626342,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.9999333620071411,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.1888274908065796,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.9839712381362915,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.1859964370727539,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.8771271705627441,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.1878008484840393,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 1.0618975162506104,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.19025259017944335,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.9912660121917725,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.18360999822616578,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.9649500250816345,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.18223977088928223,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.9794525504112244,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.19027379751205445,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.9974610209465027,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.1869723081588745,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.9569944143295288,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.18647921085357666,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.8668234944343567,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.18584033250808715,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 1.0080299377441406,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.1850695490837097,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.9814310669898987,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.18304891586303712,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.9878867268562317,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.18500676155090331,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 1.0064340829849243,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.18739557266235352,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.9369391798973083,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.18277575969696044,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.9446983337402344,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.1903679609298706,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 1.0289263725280762,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.18804672956466675,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 1.047361135482788,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.18376014232635499,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 1.0056003332138062,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.19209181070327758,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.9757674336433411,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.18722193241119384,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 1.0518726110458374,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.187286376953125,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.93593430519104,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.1866356611251831,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.9336698651313782,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.1894855499267578,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 1.0136207342147827,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.1810680389404297,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.8792027831077576,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.1932243824005127,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.8541420698165894,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.1917858123779297,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 1.0666835308074951,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.1921330451965332,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.0335843563079834,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.1800443172454834,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 1.0246766805648804,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.18684718608856202,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 1.0253738164901733,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.1846174955368042,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.8940634727478027,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.1857313632965088,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.9817743897438049,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.19169492721557618,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.9626559019088745,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.18825350999832152,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 1.0254485607147217,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.1878815174102783,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.9539675712585449,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.18591114282608032,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.9155623316764832,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.18296668529510499,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.9511232376098633,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.18568153381347657,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.921597957611084,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.18664343357086183,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.9718701243400574,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.1875317335128784,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.9737975597381592,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.18509577512741088,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.9686116576194763,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.18223493099212645,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.9519445896148682,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.1867220640182495,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 1.0010050535202026,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.18873190879821777,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 1.0026582479476929,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.1890841007232666,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.9423681497573853,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.18887195587158204,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.984921932220459,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.19451572895050048,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.9740422964096069,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.18073530197143556,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.9824504256248474,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.17942516803741454,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 1.0434110164642334,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.18552230596542357,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.9475349187850952,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.1841592311859131,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.928511917591095,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.19167165756225585,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.9599359631538391,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.1884666085243225,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.9781372547149658,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.19168753623962403,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.9357272982597351,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.1932600259780884,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.9409093260765076,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.18934156894683837,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 1.0096839666366577,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.19215707778930663,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.9278941750526428,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.1875869631767273,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.9983802437782288,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.18937885761260986,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.9678918123245239,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.1848172664642334,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.97052401304245,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.18443670272827148,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 1.4635172064982467e+18,
+      "train_loss": 0.20465421264648437,
+      "train_runtime": 15978.2192,
+      "train_samples_per_second": 12.517,
+      "train_steps_per_second": 0.196
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4635172064982467e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Qwen3.5-0.8B-SFT/training_loss.png b/checkpoints/Qwen3.5-0.8B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..c48aeeabb92cee48a5379a7dc12693fa3d364df1
Binary files /dev/null and b/checkpoints/Qwen3.5-0.8B-SFT/training_loss.png differ
diff --git a/checkpoints/Qwen3.5-2B-SFT/all_results.json b/checkpoints/Qwen3.5-2B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ffaeaa73c2eda3a52b574e1dbdc141341a08de1
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 4.165948759592665e+18,
+    "train_loss": 0.19418699533462525,
+    "train_runtime": 24088.1057,
+    "train_samples_per_second": 8.303,
+    "train_steps_per_second": 0.13
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-2B-SFT/chat_template.jinja b/checkpoints/Qwen3.5-2B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/chat_template.jinja
@@ -0,0 +1,154 @@
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is true %}
+        {{- '<think>\n' }}
+    {%- else %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-2B-SFT/config.json b/checkpoints/Qwen3.5-2B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7dc8b8ac9cf89638dbb9cbec08b8786c3e320b59
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/config.json
@@ -0,0 +1,105 @@
+{
+  "architectures": [
+    "Qwen3_5ForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "eos_token_id": 248046,
+  "hidden_size": 2048,
+  "image_token_id": 248056,
+  "model_type": "qwen3_5",
+  "pad_token_id": 248044,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_output_gate": true,
+    "bos_token_id": null,
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "full_attention_interval": 4,
+    "head_dim": 256,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "layer_types": [
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention"
+    ],
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 16,
+    "linear_value_head_dim": 128,
+    "mamba_ssm_dtype": "float32",
+    "max_position_embeddings": 262144,
+    "mlp_only_layers": [],
+    "model_type": "qwen3_5_text",
+    "mtp_num_hidden_layers": 1,
+    "mtp_use_dedicated_embeddings": false,
+    "num_attention_heads": 8,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "pad_token_id": null,
+    "partial_rotary_factor": 0.25,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        11,
+        11,
+        10
+      ],
+      "partial_rotary_factor": 0.25,
+      "rope_theta": 10000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "vocab_size": 248320
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 248057,
+  "vision_config": {
+    "deepstack_visual_indexes": [],
+    "depth": 24,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1024,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "model_type": "qwen3_5",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 2048,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 248054,
+  "vision_start_token_id": 248053
+}
diff --git a/checkpoints/Qwen3.5-2B-SFT/eval_results_job_qwen35_2b_retrain_20260430_214700.json b/checkpoints/Qwen3.5-2B-SFT/eval_results_job_qwen35_2b_retrain_20260430_214700.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddadb65951bcafa6895feb73ea44fc78f15e0eee
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/eval_results_job_qwen35_2b_retrain_20260430_214700.json
@@ -0,0 +1,103 @@
+{
+  "mae_dx": 0.17143448275862067,
+  "rmse_dx": 0.586956028430022,
+  "mae_dy": 0.15297758620689655,
+  "rmse_dy": 0.4877811656032256,
+  "mae_dz": 0.016406896551724136,
+  "rmse_dz": 0.12572726366061662,
+  "mae_dpitch": 0.3087327586206896,
+  "rmse_dpitch": 0.7452746658690385,
+  "mae_dyaw": 1.2252741379310346,
+  "rmse_dyaw": 2.7683458804479444,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.31247097701149423,
+  "mae_position": 0.11360632183908044,
+  "mae_rotation": 0.511335632183908,
+  "rmse_overall": 1.2122588028850991,
+  "wp1_euc_mae": 0.08045473206756154,
+  "wp1_euc_median": 0.020000000000000018,
+  "wp2_euc_mae": 0.15801006148557586,
+  "wp2_euc_median": 0.044721359549995836,
+  "wp3_euc_mae": 0.2501664583707116,
+  "wp3_euc_median": 0.08062257748298557,
+  "wp4_euc_mae": 0.3646406455046208,
+  "wp4_euc_median": 0.1300000000000001,
+  "wp5_euc_mae": 0.49070159090908383,
+  "wp5_euc_median": 0.1811077027627483,
+  "euclidean_mae": 0.26879469766751074,
+  "ADE": 0.26879469766751074,
+  "FDE": 0.49070159090908383,
+  "ADE_median": 0.094639591080072,
+  "FDE_median": 0.1811077027627483,
+  "SR@0.1m": 0.5546551724137931,
+  "SR@0.2m": 0.7143103448275862,
+  "SR@0.3m": 0.8043103448275862,
+  "SR@0.5m": 0.885,
+  "SR@1.0m": 0.9479310344827586,
+  "SR@2.0m": 0.9791379310344828,
+  "SR@5.0m": 0.995,
+  "TrajSR@0.3m": 0.6431034482758621,
+  "TrajSR@0.5m": 0.7775862068965518,
+  "TrajSR@1.0m": 0.8862068965517241,
+  "TrajSR@2.0m": 0.9560344827586207,
+  "TrajSR@5.0m": 0.9879310344827587,
+  "RotAcc@0.5deg": 0.5272413793103449,
+  "RotAcc@1.0deg": 0.6413793103448275,
+  "RotAcc@2.0deg": 0.801896551724138,
+  "RotAcc@5.0deg": 0.9475862068965517,
+  "RotAcc@10.0deg": 0.985,
+  "TrajRotSR@1.0deg": 0.4896551724137931,
+  "TrajRotSR@2.0deg": 0.678448275862069,
+  "TrajRotSR@5.0deg": 0.8931034482758621,
+  "TrajRotSR@10.0deg": 0.9672413793103448,
+  "JointSR@(0.5m,1.0deg)": 0.7948275862068965,
+  "JointSR@(0.5m,5.0deg)": 0.971551724137931,
+  "JointSR@(1.0m,1.0deg)": 0.7991379310344827,
+  "JointSR@(1.0m,5.0deg)": 0.9810344827586207,
+  "JointSR@(0.3m,1.0deg)": 0.7853448275862069,
+  "JointSR@(0.5m,2.0deg)": 0.9129310344827586,
+  "TrajJointSR@(0.5m,1.0deg)": 0.4732758620689655,
+  "TrajJointSR@(0.5m,5.0deg)": 0.7362068965517241,
+  "TrajJointSR@(1.0m,1.0deg)": 0.4836206896551724,
+  "TrajJointSR@(1.0m,5.0deg)": 0.8310344827586207,
+  "TrajJointSR@(0.3m,1.0deg)": 0.44482758620689655,
+  "TrajJointSR@(0.5m,2.0deg)": 0.6146551724137931,
+  "wp1_rot_mae": 0.6531088744470089,
+  "wp2_rot_mae": 0.9547093532296965,
+  "wp3_rot_mae": 1.283564870678303,
+  "wp4_rot_mae": 1.6748759345391497,
+  "wp5_rot_mae": 2.068755703850988,
+  "rotation_euc_mae": 1.3270029473490292,
+  "ADE_p50": 0.094639591080072,
+  "ADE_p75": 0.2599817911603864,
+  "ADE_p90": 0.570698568175275,
+  "ADE_p95": 0.9572262195020409,
+  "ADE_p99": 3.4132114747490343,
+  "ADE_max": 8.209747913390368,
+  "FDE_p50": 0.1811077027627483,
+  "FDE_p75": 0.4528520502722452,
+  "FDE_p90": 1.073723169104385,
+  "FDE_p95": 1.8549481686147773,
+  "FDE_p99": 5.462859561010458,
+  "FDE_max": 12.807700808497987,
+  "rot_err_p50": 0.41231056256176624,
+  "rot_err_p75": 1.5694345736883306,
+  "rot_err_p90": 3.3655217305746357,
+  "rot_err_p95": 5.132800694767453,
+  "rot_err_p99": 12.461762280983246,
+  "rot_err_max": 44.327771204968116,
+  "HardFailRate_pos_gt_2.0m": 0.04396551724137931,
+  "HardFailRate_pos_gt_5.0m": 0.01206896551724138,
+  "HardFailRate_pos_gt_10.0m": 0.002586206896551724,
+  "HardFailRate_rot_gt_10.0deg": 0.032758620689655175,
+  "HardFailRate_rot_gt_30.0deg": 0.002586206896551724,
+  "HardFailRate_rot_gt_60.0deg": 0.0,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-2B-SFT/generation_config.json b/checkpoints/Qwen3.5-2B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3eda838f46234dcfea054e23287d1718e943e6aa
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/generation_config.json
@@ -0,0 +1,10 @@
+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    248046,
+    248044
+  ],
+  "pad_token_id": 248044,
+  "transformers_version": "5.5.3",
+  "use_cache": true
+}
diff --git a/checkpoints/Qwen3.5-2B-SFT/model.safetensors b/checkpoints/Qwen3.5-2B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c0346f3dd6e661acd9282a03b3a02517555a5cf7
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb158c5670c2fa8ec2c96bb0b05bd69cc66f76633b8ca72e53c6a87587cfa69
+size 5443675896
diff --git a/checkpoints/Qwen3.5-2B-SFT/processor_config.json b/checkpoints/Qwen3.5-2B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..33818c7f9e991ad735fd240209f4fa73e6c28c50
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/processor_config.json
@@ -0,0 +1,60 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Qwen2VLImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "merge_size": 2,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 16777216,
+      "shortest_edge": 65536
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_frames": 768,
+    "merge_size": 2,
+    "min_frames": 4,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 25165824,
+      "shortest_edge": 4096
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Qwen3VLVideoProcessor"
+  }
+}
diff --git a/checkpoints/Qwen3.5-2B-SFT/tokenizer.json b/checkpoints/Qwen3.5-2B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..67741b04f23bfdb46501f748ce27865ec82eccfb
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
+size 19989343
diff --git a/checkpoints/Qwen3.5-2B-SFT/tokenizer_config.json b/checkpoints/Qwen3.5-2B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aeb7593d501d68475dd8091372c6d633a7a2c63f
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/tokenizer_config.json
@@ -0,0 +1,33 @@
+{
+  "add_prefix_space": false,
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": true,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "processor_class": "Qwen3VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}
diff --git a/checkpoints/Qwen3.5-2B-SFT/train_results.json b/checkpoints/Qwen3.5-2B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ffaeaa73c2eda3a52b574e1dbdc141341a08de1
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 4.165948759592665e+18,
+    "train_loss": 0.19418699533462525,
+    "train_runtime": 24088.1057,
+    "train_samples_per_second": 8.303,
+    "train_steps_per_second": 0.13
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-2B-SFT/trainer_state.json b/checkpoints/Qwen3.5-2B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..285b91883db0ef56265336d24416f04798f02c4e
--- /dev/null
+++ b/checkpoints/Qwen3.5-2B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 17.00181007385254,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.40097870826721194,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 9.006903648376465,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.3749201536178589,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 2.7475762367248535,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.3104386329650879,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 1.6237599849700928,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.28350210189819336,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 1.1944012641906738,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.26195507049560546,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 1.111968994140625,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.2548145532608032,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 1.0771538019180298,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.26291556358337403,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.9149865508079529,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.24681921005249025,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 1.1506657600402832,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.24312305450439453,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 1.067981243133545,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.24636492729187012,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.1217856407165527,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.24657981395721434,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 1.265798807144165,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.23692235946655274,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.001699686050415,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.2406609535217285,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 1.0411975383758545,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.23273375034332275,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 1.1774442195892334,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.23040361404418946,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.9190797805786133,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.2255420684814453,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.917509913444519,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.23383898735046388,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 1.1193501949310303,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.22656521797180176,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 1.022067904472351,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.23097100257873535,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.9210631847381592,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.2329324722290039,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.9588786959648132,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.2294376850128174,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.8580502867698669,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.22389583587646483,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.946774423122406,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.22544665336608888,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.9038301110267639,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.2216867446899414,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7587664127349854,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.2192216396331787,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.8255429267883301,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.22012462615966796,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.7769321799278259,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.22262439727783204,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.8024886846542358,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.21874587535858153,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.8127010464668274,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.22413947582244872,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.8755475878715515,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.2250971794128418,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.9584972858428955,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.21371636390686036,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.8256917595863342,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.21431188583374022,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.7529917359352112,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.22084765434265136,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.7913526296615601,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.21453213691711426,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.7930585145950317,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.21424226760864257,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.7409906983375549,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.21544296741485597,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.776908814907074,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.21555554866790771,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.7414209842681885,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.21163430213928222,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.7566289901733398,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.21131091117858886,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.7883878946304321,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.2116244316101074,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.7117395401000977,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.21834149360656738,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.6810120940208435,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.21092677116394043,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.7290633320808411,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.2102036714553833,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.7017252445220947,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.20992655754089357,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.7360793948173523,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.20512619018554687,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.7540563344955444,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.2015425443649292,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.7753161787986755,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.21057233810424805,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.7387011647224426,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.21185002326965333,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.7276087999343872,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.20872690677642822,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8299422264099121,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.20370903015136718,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.7076508402824402,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.20804412364959718,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.706371545791626,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.20231704711914061,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.7540279030799866,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.2063138246536255,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.6493498682975769,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.21687688827514648,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.6475550532341003,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.2051780939102173,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.7162631750106812,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.20715060234069824,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.8360611796379089,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.20873281955718995,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.615298867225647,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.20635380744934081,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.7096477746963501,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.20148007869720458,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.7298900485038757,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.2067399024963379,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.7385826706886292,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.20580265522003174,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.65291827917099,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.2078333854675293,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.6820300817489624,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.20342307090759276,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.7165417075157166,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.20381028652191163,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.7687081694602966,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.2078242301940918,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.7032524347305298,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.20481655597686768,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.7231862545013428,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.20165665149688722,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.6389712691307068,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.20336999893188476,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.6152437925338745,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.20431649684906006,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.6727204918861389,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.2011037588119507,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.6739911437034607,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.20095202922821045,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.7094120979309082,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.20286693572998046,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.7743499875068665,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.20463786125183106,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.8282691836357117,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.20196740627288817,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7588798403739929,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.20084238052368164,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.6279271245002747,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.20528481006622315,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.6513218879699707,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.200866436958313,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.6800708174705505,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.2006533145904541,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.678886890411377,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.1949576735496521,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.7107610702514648,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.19777085781097412,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.6964312791824341,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.2050178050994873,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.725965142250061,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.20062518119812012,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.7800929546356201,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.20146868228912354,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.749195396900177,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.20182161331176757,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.7181940078735352,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.1992729663848877,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.6476516723632812,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.19952178001403809,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.6579780578613281,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.19619479179382324,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.6974263787269592,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.20040647983551024,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.6694504618644714,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.19575827121734618,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.6807686686515808,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.19785771369934083,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.6845031976699829,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.19707002639770507,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.7153148055076599,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.2026472806930542,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.6482206583023071,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.19655225276947022,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.6306204795837402,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.19124293327331543,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.6225886940956116,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.19019542932510375,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.7444518804550171,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.1960599660873413,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.7806506752967834,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.19993667602539061,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.7076775431632996,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.20315132141113282,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.681239664554596,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.19584376811981202,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7305501103401184,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.19784679412841796,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.7520341873168945,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.20119853019714357,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.6639211177825928,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.197571325302124,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.6915456056594849,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.19875795841217042,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.6685866117477417,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.19526104927062987,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.7343277335166931,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.19495513439178466,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.6586959958076477,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.19428329467773436,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.7318632006645203,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.19764463901519774,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.6407229900360107,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.19589632749557495,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.6162118911743164,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.1992954969406128,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.6185060739517212,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.19082267284393312,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.6647833585739136,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.1906151533126831,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.7094317674636841,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.19853758811950684,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.6957128643989563,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.1966111183166504,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.6727164387702942,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.19672225713729857,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.7790160179138184,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.18639541864395143,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.7126836776733398,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.19289255142211914,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.6615356802940369,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.19387151002883912,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.7060992121696472,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.19526903629302977,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.6411144137382507,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.19223073720932007,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.6146537065505981,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.1911651611328125,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.6731158494949341,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.1918567180633545,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.7398373484611511,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.20071697235107422,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.7013578414916992,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.19355409145355223,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.698662519454956,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.19278268814086913,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6574701070785522,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.19122891426086425,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.7399858832359314,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.1955470085144043,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.6889676451683044,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.18996728658676149,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.73622727394104,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.19117178916931152,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.8547033071517944,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.19134058952331542,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.6093372702598572,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.197102952003479,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.7401471138000488,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.19205976724624635,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.7101146578788757,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.18738794326782227,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.7336035370826721,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.1925289273262024,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.6703352928161621,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.1879317045211792,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.7639795541763306,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.19051835536956788,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.6993486285209656,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.18969300985336304,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.6297987103462219,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.1876993417739868,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.6074863076210022,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.1906881809234619,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.6951903700828552,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.18978222608566284,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.7054743766784668,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.18714208602905275,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.6384505033493042,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.19547748565673828,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.6575579643249512,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.18565211296081544,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.6746217608451843,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.1961667060852051,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.6876834630966187,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.18892121315002441,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.753948986530304,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.186625075340271,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.6489424109458923,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.18691904544830323,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.619157075881958,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.1932499647140503,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.693301796913147,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.18296878337860106,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.7320811152458191,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.1871619701385498,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7020161747932434,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.19298884868621827,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.7094489932060242,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.19058284759521485,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.6349408030509949,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.19067602157592772,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.6655322909355164,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.18840067386627196,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.6164231896400452,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.1893659234046936,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.6291287541389465,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.19131317138671874,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.7157194018363953,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.19394657611846924,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.701957106590271,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.18929052352905273,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.7337380051612854,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.1855141282081604,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.7762733101844788,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.18262506723403932,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.721386730670929,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.1928829550743103,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.6835659146308899,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.18776969909667968,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.6848421692848206,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.19408185482025148,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.6096782684326172,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.18251194953918456,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.6688647270202637,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.18787600994110107,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.6558749675750732,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.18639014959335326,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.6501591801643372,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.18570780754089355,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.6679975986480713,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.18148298263549806,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.6896058320999146,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.18533236980438234,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.6504887342453003,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.18286749124526977,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.6970167756080627,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.1885154962539673,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.7530934810638428,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.18549870252609252,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.7539217472076416,
+      "learning_rate": 2.5e-06,
+      "loss": 0.18675594329833983,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.7183989882469177,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.181170654296875,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.6666719317436218,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.18674240112304688,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7493124604225159,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.18279486894607544,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.683366596698761,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.18034757375717164,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.7152907252311707,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.18426483869552612,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.7228681445121765,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.18772796392440796,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.6640520095825195,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.18662737607955932,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.7385361194610596,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.18393266201019287,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.6731036305427551,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.17882565259933472,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.6727462410926819,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.1844714403152466,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.6584063768386841,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.1889561891555786,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.7410686016082764,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.18200514316558838,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.6837431788444519,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.18779854774475097,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.7690724730491638,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.18021690845489502,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.7139452695846558,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.1776273250579834,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.7305198907852173,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.18511974811553955,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.7747855186462402,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.18330963850021362,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.7213889360427856,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.18462073802947998,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.6749533414840698,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.17921651601791383,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.7167413830757141,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.18418827056884765,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.6720898747444153,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.17980489730834961,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.721839189529419,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.18582551479339598,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.6840994954109192,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.19025051593780518,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.649747908115387,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.18501710891723633,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.7345659732818604,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.17963820695877075,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.7330230474472046,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.18132896423339845,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.7300711870193481,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.17989938259124755,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6808901429176331,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.18194899559020997,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.7305233478546143,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.1753689169883728,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.6736240983009338,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.1841512680053711,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.7294182777404785,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.18403513431549073,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.7472907900810242,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.181609046459198,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.6928985714912415,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.17561020851135253,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.7173072695732117,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.18147742748260498,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.7320525050163269,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.18284125328063966,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.7757209539413452,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.18450961112976075,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.7066748142242432,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.18320014476776122,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.7570253014564514,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.18169447183609008,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.7254899144172668,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.1820533037185669,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.71804279088974,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.18047101497650148,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.7074591517448425,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.18028099536895753,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.7010617852210999,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.18579988479614257,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.6825765371322632,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.17880038022994996,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.7335405945777893,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.1810365080833435,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.7248731851577759,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.17573955059051513,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.7687682509422302,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.1800560474395752,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.7647348046302795,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.18106215000152587,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.6903149485588074,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.1801396608352661,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.8023920655250549,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.18119274377822875,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.8501482009887695,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.18241000175476074,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.7184765338897705,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.17752939462661743,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.7270201444625854,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.1741568326950073,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8636892437934875,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.1828829050064087,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.7514937520027161,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.1855594277381897,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.7141839265823364,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.18119051456451415,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.7850434184074402,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.1793533205986023,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.7188661098480225,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.18033140897750854,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.7670432329177856,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.17953202724456788,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.7422866225242615,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.17864506244659423,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.8212261199951172,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.18146371841430664,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.7881020903587341,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.17564767599105835,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.7393279671669006,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.1761918306350708,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.7876865863800049,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.17920515537261963,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.7123493552207947,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.17455912828445436,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.7895896434783936,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.1802891969680786,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.7665727138519287,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.18085892200469972,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.7168281674385071,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.1772964835166931,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.7637878656387329,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.18090858459472656,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.7479363679885864,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.1801297664642334,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.7759881615638733,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.18061695098876954,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.7594044804573059,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.17679831981658936,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.6967716813087463,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.17618390321731567,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.7475560307502747,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.1785645604133606,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.7539611458778381,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.17528293132781983,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.7217035293579102,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.17803908586502076,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.9057507514953613,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.1803913116455078,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.7403589487075806,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.18166184425354004,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7768926620483398,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.18331187963485718,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.7482465505599976,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.18083620071411133,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.8199617266654968,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.1817050337791443,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.8081801533699036,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.17815799713134767,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.7771419286727905,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.17568042278289794,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.7164714336395264,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.1774258255958557,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.8132921457290649,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.1790420413017273,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.7847883105278015,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.17356895208358764,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.7992151379585266,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.1719137191772461,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.7740475535392761,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.17878987789154052,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.751238226890564,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.17630958557128906,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.7768763303756714,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.17546494007110597,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.7661908864974976,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.17503058910369873,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.8674443960189819,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.1738388180732727,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.8360145688056946,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.1726490616798401,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.7894387245178223,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.17366337776184082,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.793187141418457,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.17671537399291992,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.7538631558418274,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.1724637746810913,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.7905218005180359,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.17893683910369873,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.8051464557647705,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.17726495265960693,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.7938066720962524,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.1730543851852417,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.8020470142364502,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.1814263343811035,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.8322368264198303,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.17693625688552855,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.8700771331787109,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.17613067626953124,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.8105334043502808,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.175561785697937,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7427732348442078,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.1789578080177307,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.7436424493789673,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.17029953002929688,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.7291203737258911,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.18220956325531007,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.7028378248214722,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.18021023273468018,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.8554368615150452,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.18080689907073974,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.8060093522071838,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.16961441040039063,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.7748635411262512,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.17575571537017823,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.8475610613822937,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.17327470779418946,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.8207055926322937,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.17445048093795776,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.8258077502250671,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.18045923709869385,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.7929080724716187,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.17697184085845946,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.8584709167480469,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.1764688014984131,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.7925869822502136,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.17493282556533812,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.7998941540718079,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.17196444272994996,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.771435022354126,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.17460463047027588,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.7990598082542419,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.17567555904388427,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.8290013670921326,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.17620582580566407,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.7977768778800964,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.17442046403884887,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.8192407488822937,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.17177685499191284,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.7740393877029419,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.1760073184967041,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.7851013541221619,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.1770064949989319,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.7475484609603882,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.1779289126396179,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.7964699268341064,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.17799465656280516,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.8085147738456726,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.18326969146728517,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.8301185369491577,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.16990091800689697,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7779503464698792,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.1689441442489624,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.8286770582199097,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.17429101467132568,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.7744504809379578,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.17314184904098512,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.8272920846939087,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.18057892322540284,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.7745753526687622,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.17820963859558106,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.8267188668251038,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.1798885703086853,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.7813923954963684,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.18165643215179444,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.735295832157135,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.1776532292366028,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.8481670022010803,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.1802857756614685,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.7306717038154602,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.17649333477020263,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.8213961124420166,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.1785252332687378,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.7591016888618469,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.1741119384765625,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.7711865901947021,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.17355493307113648,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 4.165948759592665e+18,
+      "train_loss": 0.19418699533462525,
+      "train_runtime": 24088.1057,
+      "train_samples_per_second": 8.303,
+      "train_steps_per_second": 0.13
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.165948759592665e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Qwen3.5-2B-SFT/training_loss.png b/checkpoints/Qwen3.5-2B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d0acc0b7a8d46d58ed77cbe8036f0055241b39f
Binary files /dev/null and b/checkpoints/Qwen3.5-2B-SFT/training_loss.png differ
diff --git a/checkpoints/Qwen3.5-9B-SFT/all_results.json b/checkpoints/Qwen3.5-9B-SFT/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6cdf9016f3c7d9a1fdeac0f260aa979865b74fa1
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 2.0482865290660545e+19,
+    "train_loss": 0.18024104360580445,
+    "train_runtime": 72884.8323,
+    "train_samples_per_second": 2.744,
+    "train_steps_per_second": 0.043
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-9B-SFT/chat_template.jinja b/checkpoints/Qwen3.5-9B-SFT/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..a585dec894e63da457d9440ec6aa7caa16d20860
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/chat_template.jinja
@@ -0,0 +1,154 @@
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- else %}
+        {{- '<think>\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-9B-SFT/config.json b/checkpoints/Qwen3.5-9B-SFT/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..71227c5c4b4ecbd56e9afab6bcfe87e66dd6aafd
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/config.json
@@ -0,0 +1,113 @@
+{
+  "architectures": [
+    "Qwen3_5ForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "eos_token_id": 248046,
+  "hidden_size": 4096,
+  "image_token_id": 248056,
+  "model_type": "qwen3_5",
+  "pad_token_id": 248044,
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_output_gate": true,
+    "bos_token_id": null,
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "full_attention_interval": 4,
+    "head_dim": 256,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 12288,
+    "layer_types": [
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention",
+      "linear_attention",
+      "linear_attention",
+      "linear_attention",
+      "full_attention"
+    ],
+    "linear_conv_kernel_dim": 4,
+    "linear_key_head_dim": 128,
+    "linear_num_key_heads": 16,
+    "linear_num_value_heads": 32,
+    "linear_value_head_dim": 128,
+    "mamba_ssm_dtype": "float32",
+    "max_position_embeddings": 262144,
+    "mlp_only_layers": [],
+    "model_type": "qwen3_5_text",
+    "mtp_num_hidden_layers": 1,
+    "mtp_use_dedicated_embeddings": false,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 4,
+    "pad_token_id": null,
+    "partial_rotary_factor": 0.25,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        11,
+        11,
+        10
+      ],
+      "partial_rotary_factor": 0.25,
+      "rope_theta": 10000000,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": false,
+    "use_cache": false,
+    "vocab_size": 248320
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.5.3",
+  "use_cache": false,
+  "video_token_id": 248057,
+  "vision_config": {
+    "deepstack_visual_indexes": [],
+    "depth": 27,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "model_type": "qwen3_5",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 4096,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 248054,
+  "vision_start_token_id": 248053
+}
diff --git a/checkpoints/Qwen3.5-9B-SFT/eval_results_job_qwen35_9b_retrain_20260430_213219.json b/checkpoints/Qwen3.5-9B-SFT/eval_results_job_qwen35_9b_retrain_20260430_213219.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fd0829875b9b81439859fe70b7708b110be1526
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/eval_results_job_qwen35_9b_retrain_20260430_213219.json
@@ -0,0 +1,103 @@
+{
+  "mae_dx": 0.15343275862068967,
+  "rmse_dx": 0.5341561343484403,
+  "mae_dy": 0.13656551724137933,
+  "rmse_dy": 0.42426236189902344,
+  "mae_dz": 0.013632758620689654,
+  "rmse_dz": 0.09480715453869575,
+  "mae_dpitch": 0.2582155172413793,
+  "rmse_dpitch": 0.6067670262447779,
+  "mae_dyaw": 1.0164931034482758,
+  "rmse_dyaw": 2.522523415017261,
+  "mae_droll": 0.0,
+  "rmse_droll": 0.0,
+  "mae_overall": 0.2630566091954023,
+  "mae_position": 0.1012103448275862,
+  "mae_rotation": 0.4249028735632184,
+  "rmse_overall": 1.095871063552202,
+  "wp1_euc_mae": 0.06754110858254206,
+  "wp1_euc_median": 0.014142135623730963,
+  "wp2_euc_mae": 0.1401890514522129,
+  "wp2_euc_median": 0.040000000000000036,
+  "wp3_euc_mae": 0.22289144696737478,
+  "wp3_euc_median": 0.07071067811865475,
+  "wp4_euc_mae": 0.32612561640376864,
+  "wp4_euc_median": 0.10908326913195984,
+  "wp5_euc_mae": 0.43614143999566835,
+  "wp5_euc_median": 0.15247941867100795,
+  "euclidean_mae": 0.23857773268031338,
+  "ADE": 0.23857773268031332,
+  "FDE": 0.43614143999566835,
+  "ADE_median": 0.08079292858939022,
+  "FDE_median": 0.15247941867100795,
+  "SR@0.1m": 0.5870689655172414,
+  "SR@0.2m": 0.7353448275862069,
+  "SR@0.3m": 0.8167241379310345,
+  "SR@0.5m": 0.8951724137931034,
+  "SR@1.0m": 0.9536206896551724,
+  "SR@2.0m": 0.9837931034482759,
+  "SR@5.0m": 0.9963793103448276,
+  "TrajSR@0.3m": 0.6646551724137931,
+  "TrajSR@0.5m": 0.7939655172413793,
+  "TrajSR@1.0m": 0.8939655172413793,
+  "TrajSR@2.0m": 0.9620689655172414,
+  "TrajSR@5.0m": 0.9913793103448276,
+  "RotAcc@0.5deg": 0.5817241379310345,
+  "RotAcc@1.0deg": 0.7067241379310345,
+  "RotAcc@2.0deg": 0.8489655172413794,
+  "RotAcc@5.0deg": 0.9579310344827586,
+  "RotAcc@10.0deg": 0.9891379310344828,
+  "TrajRotSR@1.0deg": 0.5543103448275862,
+  "TrajRotSR@2.0deg": 0.7396551724137931,
+  "TrajRotSR@5.0deg": 0.9155172413793103,
+  "TrajRotSR@10.0deg": 0.975,
+  "JointSR@(0.5m,1.0deg)": 0.8508620689655172,
+  "JointSR@(0.5m,5.0deg)": 0.9775862068965517,
+  "JointSR@(1.0m,1.0deg)": 0.8612068965517241,
+  "JointSR@(1.0m,5.0deg)": 0.9862068965517241,
+  "JointSR@(0.3m,1.0deg)": 0.8396551724137931,
+  "JointSR@(0.5m,2.0deg)": 0.9387931034482758,
+  "TrajJointSR@(0.5m,1.0deg)": 0.5439655172413793,
+  "TrajJointSR@(0.5m,5.0deg)": 0.7655172413793103,
+  "TrajJointSR@(1.0m,1.0deg)": 0.5517241379310345,
+  "TrajJointSR@(1.0m,5.0deg)": 0.8551724137931035,
+  "TrajJointSR@(0.3m,1.0deg)": 0.5017241379310344,
+  "TrajJointSR@(0.5m,2.0deg)": 0.6758620689655173,
+  "wp1_rot_mae": 0.5196723087296781,
+  "wp2_rot_mae": 0.7743373043521142,
+  "wp3_rot_mae": 1.05358415582728,
+  "wp4_rot_mae": 1.3937223028539498,
+  "wp5_rot_mae": 1.7340464665714714,
+  "rotation_euc_mae": 1.0950725076668986,
+  "ADE_p50": 0.08079292858939022,
+  "ADE_p75": 0.23279656933645615,
+  "ADE_p90": 0.5287726396289388,
+  "ADE_p95": 0.8929334723810287,
+  "ADE_p99": 2.912877609712158,
+  "ADE_max": 6.8209393839768255,
+  "FDE_p50": 0.15247941867100795,
+  "FDE_p75": 0.4132160936367516,
+  "FDE_p90": 1.0675624229068608,
+  "FDE_p95": 1.57236856680788,
+  "FDE_p99": 4.44038146071553,
+  "FDE_max": 11.080839318391003,
+  "rot_err_p50": 0.31000000000000005,
+  "rot_err_p75": 1.2532358118087752,
+  "rot_err_p90": 2.770032490445444,
+  "rot_err_p95": 4.475917224403236,
+  "rot_err_p99": 10.816108833880216,
+  "rot_err_max": 45.749336607212136,
+  "HardFailRate_pos_gt_2.0m": 0.03793103448275862,
+  "HardFailRate_pos_gt_5.0m": 0.008620689655172414,
+  "HardFailRate_pos_gt_10.0m": 0.0034482758620689655,
+  "HardFailRate_rot_gt_10.0deg": 0.025,
+  "HardFailRate_rot_gt_30.0deg": 0.004310344827586207,
+  "HardFailRate_rot_gt_60.0deg": 0.0,
+  "parse_failure_rate": 0.0,
+  "parse_success_rate": 1.0,
+  "valid_samples": 1160,
+  "total_samples": 1160,
+  "parse_failures": 0,
+  "inference_engine": "vllm",
+  "vllm_version": "0.19.0"
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-9B-SFT/generation_config.json b/checkpoints/Qwen3.5-9B-SFT/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3eda838f46234dcfea054e23287d1718e943e6aa
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/generation_config.json
@@ -0,0 +1,10 @@
+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    248046,
+    248044
+  ],
+  "pad_token_id": 248044,
+  "transformers_version": "5.5.3",
+  "use_cache": true
+}
diff --git a/checkpoints/Qwen3.5-9B-SFT/model.safetensors b/checkpoints/Qwen3.5-9B-SFT/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aba813b54c5a53544ec4766a36435c886d7f31c7
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9e744b280f85b2c32d167f742a66937629018b0ef539faee7f0f396250653f1
+size 19306303304
diff --git a/checkpoints/Qwen3.5-9B-SFT/processor_config.json b/checkpoints/Qwen3.5-9B-SFT/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..33818c7f9e991ad735fd240209f4fa73e6c28c50
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/processor_config.json
@@ -0,0 +1,60 @@
+{
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Qwen2VLImageProcessor",
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "merge_size": 2,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "longest_edge": 16777216,
+      "shortest_edge": 65536
+    },
+    "temporal_patch_size": 2
+  },
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "fps": 2,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "max_frames": 768,
+    "merge_size": 2,
+    "min_frames": 4,
+    "patch_size": 16,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "size": {
+      "longest_edge": 25165824,
+      "shortest_edge": 4096
+    },
+    "temporal_patch_size": 2,
+    "video_processor_type": "Qwen3VLVideoProcessor"
+  }
+}
diff --git a/checkpoints/Qwen3.5-9B-SFT/tokenizer.json b/checkpoints/Qwen3.5-9B-SFT/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..67741b04f23bfdb46501f748ce27865ec82eccfb
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
+size 19989343
diff --git a/checkpoints/Qwen3.5-9B-SFT/tokenizer_config.json b/checkpoints/Qwen3.5-9B-SFT/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aeb7593d501d68475dd8091372c6d633a7a2c63f
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/tokenizer_config.json
@@ -0,0 +1,33 @@
+{
+  "add_prefix_space": false,
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": true,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "processor_class": "Qwen3VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}
diff --git a/checkpoints/Qwen3.5-9B-SFT/train_results.json b/checkpoints/Qwen3.5-9B-SFT/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6cdf9016f3c7d9a1fdeac0f260aa979865b74fa1
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.0,
+    "total_flos": 2.0482865290660545e+19,
+    "train_loss": 0.18024104360580445,
+    "train_runtime": 72884.8323,
+    "train_samples_per_second": 2.744,
+    "train_steps_per_second": 0.043
+}
\ No newline at end of file
diff --git a/checkpoints/Qwen3.5-9B-SFT/trainer_state.json b/checkpoints/Qwen3.5-9B-SFT/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b713063ddecac87da574007171b92891c6d57c1
--- /dev/null
+++ b/checkpoints/Qwen3.5-9B-SFT/trainer_state.json
@@ -0,0 +1,2227 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 14.152398109436035,
+      "learning_rate": 1.437699680511182e-07,
+      "loss": 0.37259225845336913,
+      "step": 10
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 5.804495334625244,
+      "learning_rate": 3.0351437699680514e-07,
+      "loss": 0.3420424461364746,
+      "step": 20
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.7388691902160645,
+      "learning_rate": 4.6325878594249205e-07,
+      "loss": 0.2912754058837891,
+      "step": 30
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.9213385581970215,
+      "learning_rate": 6.230031948881789e-07,
+      "loss": 0.2695852518081665,
+      "step": 40
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7402991056442261,
+      "learning_rate": 7.82747603833866e-07,
+      "loss": 0.2490011692047119,
+      "step": 50
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.8080454468727112,
+      "learning_rate": 9.424920127795528e-07,
+      "loss": 0.24377479553222656,
+      "step": 60
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7839148640632629,
+      "learning_rate": 1.1022364217252397e-06,
+      "loss": 0.2515081882476807,
+      "step": 70
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.6039342880249023,
+      "learning_rate": 1.2619808306709266e-06,
+      "loss": 0.236421799659729,
+      "step": 80
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6679620146751404,
+      "learning_rate": 1.4217252396166134e-06,
+      "loss": 0.23224935531616211,
+      "step": 90
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.6516961455345154,
+      "learning_rate": 1.5814696485623005e-06,
+      "loss": 0.23475091457366942,
+      "step": 100
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.6763356924057007,
+      "learning_rate": 1.7412140575079875e-06,
+      "loss": 0.2348011016845703,
+      "step": 110
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.7100752592086792,
+      "learning_rate": 1.9009584664536742e-06,
+      "loss": 0.22568964958190918,
+      "step": 120
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.7218499779701233,
+      "learning_rate": 2.060702875399361e-06,
+      "loss": 0.22881340980529785,
+      "step": 130
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.7274235486984253,
+      "learning_rate": 2.220447284345048e-06,
+      "loss": 0.2210618019104004,
+      "step": 140
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.7489368319511414,
+      "learning_rate": 2.380191693290735e-06,
+      "loss": 0.21785550117492675,
+      "step": 150
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.636695384979248,
+      "learning_rate": 2.539936102236422e-06,
+      "loss": 0.21415135860443116,
+      "step": 160
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.7647548317909241,
+      "learning_rate": 2.699680511182109e-06,
+      "loss": 0.22171540260314943,
+      "step": 170
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.6201676726341248,
+      "learning_rate": 2.8594249201277955e-06,
+      "loss": 0.21528491973876954,
+      "step": 180
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.7164552807807922,
+      "learning_rate": 3.0191693290734825e-06,
+      "loss": 0.21876811981201172,
+      "step": 190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.7416535019874573,
+      "learning_rate": 3.17891373801917e-06,
+      "loss": 0.22004494667053223,
+      "step": 200
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.6418226361274719,
+      "learning_rate": 3.3386581469648564e-06,
+      "loss": 0.21785356998443603,
+      "step": 210
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.5532397031784058,
+      "learning_rate": 3.4984025559105434e-06,
+      "loss": 0.2117140293121338,
+      "step": 220
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.6566811800003052,
+      "learning_rate": 3.6581469648562303e-06,
+      "loss": 0.21361856460571288,
+      "step": 230
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.5842558741569519,
+      "learning_rate": 3.817891373801918e-06,
+      "loss": 0.21009182929992676,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5284646153450012,
+      "learning_rate": 3.977635782747604e-06,
+      "loss": 0.20717477798461914,
+      "step": 250
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.4791855812072754,
+      "learning_rate": 4.137380191693291e-06,
+      "loss": 0.20851461887359618,
+      "step": 260
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.5538341999053955,
+      "learning_rate": 4.297124600638978e-06,
+      "loss": 0.21010227203369142,
+      "step": 270
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.5214399695396423,
+      "learning_rate": 4.456869009584665e-06,
+      "loss": 0.20617682933807374,
+      "step": 280
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.5693404674530029,
+      "learning_rate": 4.616613418530352e-06,
+      "loss": 0.21186504364013672,
+      "step": 290
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.520547091960907,
+      "learning_rate": 4.776357827476039e-06,
+      "loss": 0.2132932186126709,
+      "step": 300
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.5054877400398254,
+      "learning_rate": 4.936102236421725e-06,
+      "loss": 0.2020902156829834,
+      "step": 310
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.5992368459701538,
+      "learning_rate": 4.999943833158769e-06,
+      "loss": 0.20265860557556153,
+      "step": 320
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.4485788941383362,
+      "learning_rate": 4.999600600490783e-06,
+      "loss": 0.2094430923461914,
+      "step": 330
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.48217350244522095,
+      "learning_rate": 4.9989453817439345e-06,
+      "loss": 0.20311150550842286,
+      "step": 340
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.48118671774864197,
+      "learning_rate": 4.997978258698942e-06,
+      "loss": 0.20234436988830568,
+      "step": 350
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.4691317677497864,
+      "learning_rate": 4.996699352066659e-06,
+      "loss": 0.20289850234985352,
+      "step": 360
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.5056607723236084,
+      "learning_rate": 4.995108821473014e-06,
+      "loss": 0.20348825454711914,
+      "step": 370
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.46245047450065613,
+      "learning_rate": 4.993206865439084e-06,
+      "loss": 0.1985713481903076,
+      "step": 380
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.45679083466529846,
+      "learning_rate": 4.990993721356317e-06,
+      "loss": 0.1985337495803833,
+      "step": 390
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.513569712638855,
+      "learning_rate": 4.988469665456901e-06,
+      "loss": 0.1989649534225464,
+      "step": 400
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.47685959935188293,
+      "learning_rate": 4.985635012779288e-06,
+      "loss": 0.20632779598236084,
+      "step": 410
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.42005518078804016,
+      "learning_rate": 4.98249011712887e-06,
+      "loss": 0.1997704029083252,
+      "step": 420
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.40938711166381836,
+      "learning_rate": 4.979035371033824e-06,
+      "loss": 0.198714542388916,
+      "step": 430
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.4478866755962372,
+      "learning_rate": 4.975271205696115e-06,
+      "loss": 0.1973956823348999,
+      "step": 440
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.4451947510242462,
+      "learning_rate": 4.971198090937671e-06,
+      "loss": 0.19358736276626587,
+      "step": 450
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.4245293140411377,
+      "learning_rate": 4.966816535141756e-06,
+      "loss": 0.18962936401367186,
+      "step": 460
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.44330471754074097,
+      "learning_rate": 4.9621270851895035e-06,
+      "loss": 0.19763081073760985,
+      "step": 470
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.4569932520389557,
+      "learning_rate": 4.957130326391662e-06,
+      "loss": 0.19951647520065308,
+      "step": 480
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.46161818504333496,
+      "learning_rate": 4.951826882415544e-06,
+      "loss": 0.1966404676437378,
+      "step": 490
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5257421135902405,
+      "learning_rate": 4.946217415207177e-06,
+      "loss": 0.1917549967765808,
+      "step": 500
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.47060427069664,
+      "learning_rate": 4.940302624908689e-06,
+      "loss": 0.1965235710144043,
+      "step": 510
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.49784040451049805,
+      "learning_rate": 4.934083249770912e-06,
+      "loss": 0.19047110080718993,
+      "step": 520
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.46195048093795776,
+      "learning_rate": 4.927560066061251e-06,
+      "loss": 0.1933290719985962,
+      "step": 530
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.44262751936912537,
+      "learning_rate": 4.920733887966783e-06,
+      "loss": 0.20422852039337158,
+      "step": 540
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.4286530315876007,
+      "learning_rate": 4.913605567492636e-06,
+      "loss": 0.19317498207092285,
+      "step": 550
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.4754306375980377,
+      "learning_rate": 4.906175994355656e-06,
+      "loss": 0.19484668970108032,
+      "step": 560
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.542935311794281,
+      "learning_rate": 4.898446095873345e-06,
+      "loss": 0.19600849151611327,
+      "step": 570
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.4545859396457672,
+      "learning_rate": 4.890416836848128e-06,
+      "loss": 0.193756628036499,
+      "step": 580
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.44452813267707825,
+      "learning_rate": 4.882089219446925e-06,
+      "loss": 0.18871793746948243,
+      "step": 590
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.49089452624320984,
+      "learning_rate": 4.873464283076074e-06,
+      "loss": 0.19349279403686523,
+      "step": 600
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.40194588899612427,
+      "learning_rate": 4.864543104251587e-06,
+      "loss": 0.19310340881347657,
+      "step": 610
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.4422169327735901,
+      "learning_rate": 4.855326796464798e-06,
+      "loss": 0.1940554141998291,
+      "step": 620
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.3977762460708618,
+      "learning_rate": 4.8458165100433725e-06,
+      "loss": 0.1905130386352539,
+      "step": 630
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.47129112482070923,
+      "learning_rate": 4.836013432007738e-06,
+      "loss": 0.1905440092086792,
+      "step": 640
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.44452038407325745,
+      "learning_rate": 4.825918785922921e-06,
+      "loss": 0.19458037614822388,
+      "step": 650
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.511098325252533,
+      "learning_rate": 4.8155338317458315e-06,
+      "loss": 0.19132745265960693,
+      "step": 660
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.44879648089408875,
+      "learning_rate": 4.804859865668002e-06,
+      "loss": 0.18898229598999022,
+      "step": 670
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.42120668292045593,
+      "learning_rate": 4.793898219953804e-06,
+      "loss": 0.1905623197555542,
+      "step": 680
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.4307490587234497,
+      "learning_rate": 4.782650262774164e-06,
+      "loss": 0.1913731336593628,
+      "step": 690
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.4430791735649109,
+      "learning_rate": 4.7711173980357886e-06,
+      "loss": 0.1875431776046753,
+      "step": 700
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.45506489276885986,
+      "learning_rate": 4.759301065205947e-06,
+      "loss": 0.18780747652053834,
+      "step": 710
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.4784105122089386,
+      "learning_rate": 4.7472027391328e-06,
+      "loss": 0.18949007987976074,
+      "step": 720
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.5166275501251221,
+      "learning_rate": 4.734823929861317e-06,
+      "loss": 0.19141184091567992,
+      "step": 730
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.4763261675834656,
+      "learning_rate": 4.722166182444801e-06,
+      "loss": 0.1886807918548584,
+      "step": 740
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.4896913170814514,
+      "learning_rate": 4.709231076752045e-06,
+      "loss": 0.18765724897384645,
+      "step": 750
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.4405442178249359,
+      "learning_rate": 4.696020227270142e-06,
+      "loss": 0.1914137840270996,
+      "step": 760
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.4831804633140564,
+      "learning_rate": 4.6825352829029705e-06,
+      "loss": 0.1876143217086792,
+      "step": 770
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.541253387928009,
+      "learning_rate": 4.668777926765392e-06,
+      "loss": 0.18711633682250978,
+      "step": 780
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.4409460723400116,
+      "learning_rate": 4.6547498759731725e-06,
+      "loss": 0.182257080078125,
+      "step": 790
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.4363306164741516,
+      "learning_rate": 4.6404528814286575e-06,
+      "loss": 0.18522121906280517,
+      "step": 800
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.42511361837387085,
+      "learning_rate": 4.6258887276022425e-06,
+      "loss": 0.19108107089996337,
+      "step": 810
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.549828290939331,
+      "learning_rate": 4.611059232309639e-06,
+      "loss": 0.18742183446884156,
+      "step": 820
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.48544079065322876,
+      "learning_rate": 4.595966246484986e-06,
+      "loss": 0.18868749141693114,
+      "step": 830
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.4877803325653076,
+      "learning_rate": 4.580611653949829e-06,
+      "loss": 0.1887261152267456,
+      "step": 840
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.43121787905693054,
+      "learning_rate": 4.564997371177992e-06,
+      "loss": 0.18640785217285155,
+      "step": 850
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.46837401390075684,
+      "learning_rate": 4.54912534705637e-06,
+      "loss": 0.1863306999206543,
+      "step": 860
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.4588525593280792,
+      "learning_rate": 4.532997562641683e-06,
+      "loss": 0.18255772590637206,
+      "step": 870
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.4227045774459839,
+      "learning_rate": 4.516616030913214e-06,
+      "loss": 0.18717021942138673,
+      "step": 880
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.4720568358898163,
+      "learning_rate": 4.499982796521556e-06,
+      "loss": 0.18217651844024657,
+      "step": 890
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.4398342967033386,
+      "learning_rate": 4.48309993553341e-06,
+      "loss": 0.18433446884155275,
+      "step": 900
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.44484105706214905,
+      "learning_rate": 4.465969555172468e-06,
+      "loss": 0.18349353075027466,
+      "step": 910
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.43579065799713135,
+      "learning_rate": 4.448593793556391e-06,
+      "loss": 0.1893969178199768,
+      "step": 920
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.43624043464660645,
+      "learning_rate": 4.430974819429954e-06,
+      "loss": 0.18258814811706542,
+      "step": 930
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.42816856503486633,
+      "learning_rate": 4.413114831894344e-06,
+      "loss": 0.1781400442123413,
+      "step": 940
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.4396454989910126,
+      "learning_rate": 4.3950160601326865e-06,
+      "loss": 0.17804189920425414,
+      "step": 950
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 19.137165069580078,
+      "learning_rate": 4.376680763131811e-06,
+      "loss": 0.18540620803833008,
+      "step": 960
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.4217820167541504,
+      "learning_rate": 4.358111229400296e-06,
+      "loss": 0.19113874435424805,
+      "step": 970
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.44505834579467773,
+      "learning_rate": 4.33930977668283e-06,
+      "loss": 0.18937530517578124,
+      "step": 980
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.5085921287536621,
+      "learning_rate": 4.320278751670922e-06,
+      "loss": 0.18254513740539552,
+      "step": 990
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.49309051036834717,
+      "learning_rate": 4.301020529710009e-06,
+      "loss": 0.18421100378036498,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.47118672728538513,
+      "learning_rate": 4.281537514502962e-06,
+      "loss": 0.1877603769302368,
+      "step": 1010
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.4600847363471985,
+      "learning_rate": 4.261832137810093e-06,
+      "loss": 0.1849065065383911,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.44828227162361145,
+      "learning_rate": 4.241906859145611e-06,
+      "loss": 0.18526692390441896,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.4324962794780731,
+      "learning_rate": 4.221764165470661e-06,
+      "loss": 0.18226839303970338,
+      "step": 1040
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.5017353296279907,
+      "learning_rate": 4.201406570882898e-06,
+      "loss": 0.181982159614563,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.44924962520599365,
+      "learning_rate": 4.180836616302704e-06,
+      "loss": 0.18122615814208984,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.5138829350471497,
+      "learning_rate": 4.160056869156041e-06,
+      "loss": 0.18444604873657228,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.41282355785369873,
+      "learning_rate": 4.139069923053995e-06,
+      "loss": 0.18227832317352294,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.4269866943359375,
+      "learning_rate": 4.117878397469062e-06,
+      "loss": 0.18615881204605103,
+      "step": 1090
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.39903032779693604,
+      "learning_rate": 4.096484937408195e-06,
+      "loss": 0.17755311727523804,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.4636191725730896,
+      "learning_rate": 4.074892213082676e-06,
+      "loss": 0.17751554250717164,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.48484447598457336,
+      "learning_rate": 4.0531029195748265e-06,
+      "loss": 0.18447484970092773,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.4330807030200958,
+      "learning_rate": 4.03111977650163e-06,
+      "loss": 0.18296458721160888,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.4666971266269684,
+      "learning_rate": 4.008945527675281e-06,
+      "loss": 0.18310744762420655,
+      "step": 1140
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.46516141295433044,
+      "learning_rate": 3.986582940760717e-06,
+      "loss": 0.1737138032913208,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.47476157546043396,
+      "learning_rate": 3.9640348069301785e-06,
+      "loss": 0.1797748327255249,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.4147510230541229,
+      "learning_rate": 3.941303940514826e-06,
+      "loss": 0.18082959651947023,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.4981541633605957,
+      "learning_rate": 3.918393178653472e-06,
+      "loss": 0.1821175694465637,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.44226428866386414,
+      "learning_rate": 3.895305380938468e-06,
+      "loss": 0.1790626049041748,
+      "step": 1190
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.44139742851257324,
+      "learning_rate": 3.872043429058783e-06,
+      "loss": 0.1771311044692993,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.4409257173538208,
+      "learning_rate": 3.84861022644033e-06,
+      "loss": 0.17819294929504395,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.4447258412837982,
+      "learning_rate": 3.825008697883574e-06,
+      "loss": 0.1871392011642456,
+      "step": 1220
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.4376235008239746,
+      "learning_rate": 3.8012417891984776e-06,
+      "loss": 0.1800956606864929,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.484053373336792,
+      "learning_rate": 3.777312466836819e-06,
+      "loss": 0.17867467403411866,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4140814542770386,
+      "learning_rate": 3.7532237175219378e-06,
+      "loss": 0.17768914699554444,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.46091777086257935,
+      "learning_rate": 3.728978547875948e-06,
+      "loss": 0.1821272373199463,
+      "step": 1260
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.48253530263900757,
+      "learning_rate": 3.7045799840444712e-06,
+      "loss": 0.1769007444381714,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.4717654287815094,
+      "learning_rate": 3.6800310713189258e-06,
+      "loss": 0.17760204076766967,
+      "step": 1280
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.510908305644989,
+      "learning_rate": 3.6553348737564328e-06,
+      "loss": 0.17857449054718016,
+      "step": 1290
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.41923636198043823,
+      "learning_rate": 3.6304944737973794e-06,
+      "loss": 0.1835271120071411,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.4893503189086914,
+      "learning_rate": 3.6055129718806836e-06,
+      "loss": 0.17832870483398439,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.46311140060424805,
+      "learning_rate": 3.5803934860568134e-06,
+      "loss": 0.17421386241912842,
+      "step": 1320
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.5104085803031921,
+      "learning_rate": 3.5551391515986163e-06,
+      "loss": 0.17839887142181396,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.5179736018180847,
+      "learning_rate": 3.529753120609982e-06,
+      "loss": 0.17463579177856445,
+      "step": 1340
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.5104801654815674,
+      "learning_rate": 3.5042385616324243e-06,
+      "loss": 0.17735633850097657,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.4137459993362427,
+      "learning_rate": 3.4785986592495934e-06,
+      "loss": 0.17638884782791137,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.40862590074539185,
+      "learning_rate": 3.452836613689803e-06,
+      "loss": 0.17429978847503663,
+      "step": 1370
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.4509046673774719,
+      "learning_rate": 3.426955640426584e-06,
+      "loss": 0.17711507081985473,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.41654732823371887,
+      "learning_rate": 3.4009589697773605e-06,
+      "loss": 0.17662335634231568,
+      "step": 1390
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.44660893082618713,
+      "learning_rate": 3.3748498465002475e-06,
+      "loss": 0.17387230396270753,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.4556673765182495,
+      "learning_rate": 3.3486315293890693e-06,
+      "loss": 0.1813045024871826,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.47021520137786865,
+      "learning_rate": 3.3223072908666053e-06,
+      "loss": 0.17260375022888183,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.47382044792175293,
+      "learning_rate": 3.295880416576153e-06,
+      "loss": 0.1822422504425049,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.4591139554977417,
+      "learning_rate": 3.269354204971427e-06,
+      "loss": 0.17569501399993898,
+      "step": 1440
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.5069236755371094,
+      "learning_rate": 3.242731966904865e-06,
+      "loss": 0.1741647243499756,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.47500520944595337,
+      "learning_rate": 3.2160170252143913e-06,
+      "loss": 0.17435843944549562,
+      "step": 1460
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.4182279109954834,
+      "learning_rate": 3.1892127143086716e-06,
+      "loss": 0.18048869371414183,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.4673804044723511,
+      "learning_rate": 3.1623223797509347e-06,
+      "loss": 0.1697669506072998,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.5240358710289001,
+      "learning_rate": 3.135349377841396e-06,
+      "loss": 0.17352311611175536,
+      "step": 1490
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4815457761287689,
+      "learning_rate": 3.1082970751983497e-06,
+      "loss": 0.17982523441314696,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.4510243237018585,
+      "learning_rate": 3.0811688483379546e-06,
+      "loss": 0.17709922790527344,
+      "step": 1510
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.49320361018180847,
+      "learning_rate": 3.0539680832528074e-06,
+      "loss": 0.17739441394805908,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.500535786151886,
+      "learning_rate": 3.026698174989316e-06,
+      "loss": 0.17496352195739745,
+      "step": 1530
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.4047980010509491,
+      "learning_rate": 2.999362527223952e-06,
+      "loss": 0.17673254013061523,
+      "step": 1540
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.44109296798706055,
+      "learning_rate": 2.9719645518384194e-06,
+      "loss": 0.17737388610839844,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.5294535160064697,
+      "learning_rate": 2.944507668493807e-06,
+      "loss": 0.17971434593200683,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.4643715023994446,
+      "learning_rate": 2.9169953042037623e-06,
+      "loss": 0.1756264567375183,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.5058485865592957,
+      "learning_rate": 2.889430892906754e-06,
+      "loss": 0.17225983142852783,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.44740837812423706,
+      "learning_rate": 2.861817875037462e-06,
+      "loss": 0.16964515447616577,
+      "step": 1590
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.4732149839401245,
+      "learning_rate": 2.8341596970973683e-06,
+      "loss": 0.1788057804107666,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.4844909906387329,
+      "learning_rate": 2.80645981122458e-06,
+      "loss": 0.17410922050476074,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.42798951268196106,
+      "learning_rate": 2.7787216747629508e-06,
+      "loss": 0.17997238636016846,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.40051475167274475,
+      "learning_rate": 2.7509487498305615e-06,
+      "loss": 0.1678016424179077,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.4792901873588562,
+      "learning_rate": 2.7231445028875924e-06,
+      "loss": 0.17371201515197754,
+      "step": 1640
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.47259077429771423,
+      "learning_rate": 2.6953124043036604e-06,
+      "loss": 0.1733928442001343,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.47142958641052246,
+      "learning_rate": 2.667455927924667e-06,
+      "loss": 0.1721155285835266,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.45087724924087524,
+      "learning_rate": 2.6395785506392164e-06,
+      "loss": 0.16819543838500978,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.46989184617996216,
+      "learning_rate": 2.6116837519446407e-06,
+      "loss": 0.17220993041992189,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.46657466888427734,
+      "learning_rate": 2.5837750135127192e-06,
+      "loss": 0.1694636106491089,
+      "step": 1690
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.4564855098724365,
+      "learning_rate": 2.555855818755108e-06,
+      "loss": 0.17499985694885253,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.46723976731300354,
+      "learning_rate": 2.5279296523885636e-06,
+      "loss": 0.17200368642807007,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.4736442565917969,
+      "learning_rate": 2.5e-06,
+      "loss": 0.1722201704978943,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.46455293893814087,
+      "learning_rate": 2.472070347611437e-06,
+      "loss": 0.16756889820098878,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.5148719549179077,
+      "learning_rate": 2.444144181244893e-06,
+      "loss": 0.1717504858970642,
+      "step": 1740
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.4863338768482208,
+      "learning_rate": 2.416224986487282e-06,
+      "loss": 0.16999526023864747,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.4933525323867798,
+      "learning_rate": 2.3883162480553605e-06,
+      "loss": 0.167264187335968,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.47000402212142944,
+      "learning_rate": 2.3604214493607844e-06,
+      "loss": 0.17092803716659546,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 0.5010572075843811,
+      "learning_rate": 2.332544072075333e-06,
+      "loss": 0.17414600849151612,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.48513519763946533,
+      "learning_rate": 2.30468759569634e-06,
+      "loss": 0.17303409576416015,
+      "step": 1790
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.5627573728561401,
+      "learning_rate": 2.276855497112408e-06,
+      "loss": 0.1702120542526245,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.4507666230201721,
+      "learning_rate": 2.2490512501694394e-06,
+      "loss": 0.16601570844650268,
+      "step": 1810
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.44019457697868347,
+      "learning_rate": 2.2212783252370496e-06,
+      "loss": 0.1702256679534912,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.4958936870098114,
+      "learning_rate": 2.1935401887754213e-06,
+      "loss": 0.17476747035980225,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.5116126537322998,
+      "learning_rate": 2.165840302902632e-06,
+      "loss": 0.16839592456817626,
+      "step": 1840
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.45241907238960266,
+      "learning_rate": 2.1381821249625383e-06,
+      "loss": 0.17359941005706786,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.4567364454269409,
+      "learning_rate": 2.1105691070932465e-06,
+      "loss": 0.16584824323654174,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.4592301845550537,
+      "learning_rate": 2.083004695796238e-06,
+      "loss": 0.16465656757354735,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 0.5488863587379456,
+      "learning_rate": 2.055492331506194e-06,
+      "loss": 0.17121386528015137,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 0.5324817299842834,
+      "learning_rate": 2.0280354481615814e-06,
+      "loss": 0.16972928047180175,
+      "step": 1890
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.48892301321029663,
+      "learning_rate": 2.000637472776049e-06,
+      "loss": 0.170430588722229,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.47859957814216614,
+      "learning_rate": 1.973301825010685e-06,
+      "loss": 0.16521625518798827,
+      "step": 1910
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.4869942367076874,
+      "learning_rate": 1.9460319167471934e-06,
+      "loss": 0.17017598152160646,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.45271286368370056,
+      "learning_rate": 1.9188311516620466e-06,
+      "loss": 0.16556847095489502,
+      "step": 1930
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.5558905005455017,
+      "learning_rate": 1.891702924801651e-06,
+      "loss": 0.1715184211730957,
+      "step": 1940
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.585498571395874,
+      "learning_rate": 1.864650622158604e-06,
+      "loss": 0.17473173141479492,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.48184657096862793,
+      "learning_rate": 1.8376776202490666e-06,
+      "loss": 0.17099432945251464,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.5236788392066956,
+      "learning_rate": 1.8107872856913293e-06,
+      "loss": 0.16600199937820434,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.46146464347839355,
+      "learning_rate": 1.7839829747856096e-06,
+      "loss": 0.1678165912628174,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 0.5520542860031128,
+      "learning_rate": 1.7572680330951359e-06,
+      "loss": 0.1660862684249878,
+      "step": 1990
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.4780367910861969,
+      "learning_rate": 1.7306457950285747e-06,
+      "loss": 0.16768529415130615,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.49715733528137207,
+      "learning_rate": 1.704119583423848e-06,
+      "loss": 0.1622864842414856,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.4588998258113861,
+      "learning_rate": 1.677692709133396e-06,
+      "loss": 0.16991891860961914,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.515424370765686,
+      "learning_rate": 1.6513684706109311e-06,
+      "loss": 0.1692219614982605,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.5444751977920532,
+      "learning_rate": 1.6251501534997529e-06,
+      "loss": 0.16774747371673585,
+      "step": 2040
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.46245628595352173,
+      "learning_rate": 1.5990410302226405e-06,
+      "loss": 0.1620822787284851,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.4891432225704193,
+      "learning_rate": 1.5730443595734162e-06,
+      "loss": 0.16800422668457032,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.5363055467605591,
+      "learning_rate": 1.5471633863101982e-06,
+      "loss": 0.16863477230072021,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.5734866261482239,
+      "learning_rate": 1.521401340750407e-06,
+      "loss": 0.17034238576889038,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.5337269902229309,
+      "learning_rate": 1.495761438367577e-06,
+      "loss": 0.1681105375289917,
+      "step": 2090
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.553776204586029,
+      "learning_rate": 1.4702468793900187e-06,
+      "loss": 0.16727843284606933,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.4766635298728943,
+      "learning_rate": 1.444860848401384e-06,
+      "loss": 0.16690752506256104,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.5310425758361816,
+      "learning_rate": 1.4196065139431866e-06,
+      "loss": 0.166944682598114,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 0.5216221809387207,
+      "learning_rate": 1.3944870281193178e-06,
+      "loss": 0.16600677967071534,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.48818936944007874,
+      "learning_rate": 1.3695055262026208e-06,
+      "loss": 0.17084674835205077,
+      "step": 2140
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.4620889723300934,
+      "learning_rate": 1.3446651262435679e-06,
+      "loss": 0.16466886997222902,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.5135729908943176,
+      "learning_rate": 1.3199689286810746e-06,
+      "loss": 0.16638584136962892,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.5088804364204407,
+      "learning_rate": 1.2954200159555294e-06,
+      "loss": 0.16140960454940795,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.5873313546180725,
+      "learning_rate": 1.2710214521240527e-06,
+      "loss": 0.16632808446884156,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 0.5511561036109924,
+      "learning_rate": 1.246776282478063e-06,
+      "loss": 0.16666927337646484,
+      "step": 2190
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.5059501528739929,
+      "learning_rate": 1.222687533163181e-06,
+      "loss": 0.1660354971885681,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.5848804116249084,
+      "learning_rate": 1.1987582108015228e-06,
+      "loss": 0.167069149017334,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.551862359046936,
+      "learning_rate": 1.1749913021164255e-06,
+      "loss": 0.1673675775527954,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.545039713382721,
+      "learning_rate": 1.1513897735596702e-06,
+      "loss": 0.16290701627731324,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.5339580178260803,
+      "learning_rate": 1.127956570941218e-06,
+      "loss": 0.16006081104278563,
+      "step": 2240
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.6216872930526733,
+      "learning_rate": 1.104694619061533e-06,
+      "loss": 0.16769601106643678,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.5120308995246887,
+      "learning_rate": 1.0816068213465295e-06,
+      "loss": 0.1700352430343628,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.5662424564361572,
+      "learning_rate": 1.0586960594851762e-06,
+      "loss": 0.16522903442382814,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 0.5819669365882874,
+      "learning_rate": 1.0359651930698217e-06,
+      "loss": 0.16397686004638673,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 0.5186004042625427,
+      "learning_rate": 1.0134170592392837e-06,
+      "loss": 0.1659336805343628,
+      "step": 2290
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.548119843006134,
+      "learning_rate": 9.910544723247204e-07,
+      "loss": 0.16470273733139038,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.5775285959243774,
+      "learning_rate": 9.688802234983706e-07,
+      "loss": 0.16365277767181396,
+      "step": 2310
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.527587354183197,
+      "learning_rate": 9.468970804251742e-07,
+      "loss": 0.1663814902305603,
+      "step": 2320
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 0.5380632281303406,
+      "learning_rate": 9.251077869173244e-07,
+      "loss": 0.1611353039741516,
+      "step": 2330
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.5524181723594666,
+      "learning_rate": 9.035150625918054e-07,
+      "loss": 0.16146550178527833,
+      "step": 2340
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.5604588985443115,
+      "learning_rate": 8.821216025309395e-07,
+      "loss": 0.16433074474334716,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.5228386521339417,
+      "learning_rate": 8.609300769460055e-07,
+      "loss": 0.16035985946655273,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.5645732283592224,
+      "learning_rate": 8.399431308439592e-07,
+      "loss": 0.16581802368164061,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.5424208045005798,
+      "learning_rate": 8.191633836972962e-07,
+      "loss": 0.16631256341934203,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.5168543457984924,
+      "learning_rate": 7.985934291171024e-07,
+      "loss": 0.16223517656326295,
+      "step": 2390
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.5431832075119019,
+      "learning_rate": 7.7823583452934e-07,
+      "loss": 0.16580779552459718,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.5234625339508057,
+      "learning_rate": 7.58093140854389e-07,
+      "loss": 0.1643224239349365,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.5630083680152893,
+      "learning_rate": 7.381678621899077e-07,
+      "loss": 0.16557644605636596,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.564468264579773,
+      "learning_rate": 7.184624854970379e-07,
+      "loss": 0.16128422021865846,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.5211976170539856,
+      "learning_rate": 6.989794702899932e-07,
+      "loss": 0.16088463068008424,
+      "step": 2440
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.5524035096168518,
+      "learning_rate": 6.797212483290777e-07,
+      "loss": 0.162975013256073,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.5299164652824402,
+      "learning_rate": 6.60690223317171e-07,
+      "loss": 0.1605682849884033,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.5172830820083618,
+      "learning_rate": 6.418887705997046e-07,
+      "loss": 0.16314079761505126,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.600565493106842,
+      "learning_rate": 6.23319236868189e-07,
+      "loss": 0.16519222259521485,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.5615212321281433,
+      "learning_rate": 6.049839398673141e-07,
+      "loss": 0.1658394455909729,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6055536866188049,
+      "learning_rate": 5.868851681056567e-07,
+      "loss": 0.1668398857116699,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.5374464988708496,
+      "learning_rate": 5.690251805700467e-07,
+      "loss": 0.1650017738342285,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.5940120816230774,
+      "learning_rate": 5.514062064436096e-07,
+      "loss": 0.16656217575073243,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.5781455039978027,
+      "learning_rate": 5.34030444827533e-07,
+      "loss": 0.16284786462783812,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.6245594024658203,
+      "learning_rate": 5.169000644665895e-07,
+      "loss": 0.16080342531204223,
+      "step": 2540
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.5611685514450073,
+      "learning_rate": 5.000172034784442e-07,
+      "loss": 0.16152651309967042,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.6154001951217651,
+      "learning_rate": 4.833839690867853e-07,
+      "loss": 0.16308718919754028,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.603412926197052,
+      "learning_rate": 4.6700243735831705e-07,
+      "loss": 0.15839451551437378,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.5827690362930298,
+      "learning_rate": 4.508746529436311e-07,
+      "loss": 0.15733330249786376,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.5977171659469604,
+      "learning_rate": 4.350026288220083e-07,
+      "loss": 0.1626259684562683,
+      "step": 2590
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.577282726764679,
+      "learning_rate": 4.1938834605017133e-07,
+      "loss": 0.16123565435409545,
+      "step": 2600
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.5709326267242432,
+      "learning_rate": 4.0403375351501515e-07,
+      "loss": 0.1605008363723755,
+      "step": 2610
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.5334956049919128,
+      "learning_rate": 3.88940767690362e-07,
+      "loss": 0.15991599559783937,
+      "step": 2620
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.6113538146018982,
+      "learning_rate": 3.7411127239775774e-07,
+      "loss": 0.15835078954696655,
+      "step": 2630
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.5511420965194702,
+      "learning_rate": 3.595471185713431e-07,
+      "loss": 0.15640071630477906,
+      "step": 2640
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.6026161313056946,
+      "learning_rate": 3.4525012402682826e-07,
+      "loss": 0.1582810401916504,
+      "step": 2650
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.584636926651001,
+      "learning_rate": 3.3122207323460804e-07,
+      "loss": 0.16068333387374878,
+      "step": 2660
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 0.6109904646873474,
+      "learning_rate": 3.1746471709702963e-07,
+      "loss": 0.15707263946533204,
+      "step": 2670
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.5890913009643555,
+      "learning_rate": 3.039797727298585e-07,
+      "loss": 0.16340034008026122,
+      "step": 2680
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.5972010493278503,
+      "learning_rate": 2.9076892324795546e-07,
+      "loss": 0.16199719905853271,
+      "step": 2690
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.5684933662414551,
+      "learning_rate": 2.778338175551995e-07,
+      "loss": 0.157791805267334,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.5997552871704102,
+      "learning_rate": 2.6517607013868326e-07,
+      "loss": 0.16547198295593263,
+      "step": 2710
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.5953125953674316,
+      "learning_rate": 2.527972608672002e-07,
+      "loss": 0.16187076568603515,
+      "step": 2720
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.6352548003196716,
+      "learning_rate": 2.40698934794053e-07,
+      "loss": 0.16053141355514527,
+      "step": 2730
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.5708346366882324,
+      "learning_rate": 2.2888260196421237e-07,
+      "loss": 0.15960177183151245,
+      "step": 2740
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.5586206912994385,
+      "learning_rate": 2.1734973722583735e-07,
+      "loss": 0.16245728731155396,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.568911075592041,
+      "learning_rate": 2.0610178004619564e-07,
+      "loss": 0.15456781387329102,
+      "step": 2760
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.5839951634407043,
+      "learning_rate": 1.9514013433199834e-07,
+      "loss": 0.1657383680343628,
+      "step": 2770
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.5618033409118652,
+      "learning_rate": 1.8446616825416958e-07,
+      "loss": 0.16385490894317628,
+      "step": 2780
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 0.6637345552444458,
+      "learning_rate": 1.7408121407708007e-07,
+      "loss": 0.1651144504547119,
+      "step": 2790
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.6217195391654968,
+      "learning_rate": 1.6398656799226253e-07,
+      "loss": 0.15437800884246827,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.5852035284042358,
+      "learning_rate": 1.5418348995662773e-07,
+      "loss": 0.15989196300506592,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.5942526459693909,
+      "learning_rate": 1.4467320353520275e-07,
+      "loss": 0.15796263217926027,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 0.6098470687866211,
+      "learning_rate": 1.3545689574841341e-07,
+      "loss": 0.15871069431304932,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.5966947674751282,
+      "learning_rate": 1.26535716923927e-07,
+      "loss": 0.16476703882217408,
+      "step": 2840
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.6142219305038452,
+      "learning_rate": 1.1791078055307493e-07,
+      "loss": 0.16100226640701293,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.6414586901664734,
+      "learning_rate": 1.0958316315187289e-07,
+      "loss": 0.1599841833114624,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.5591761469841003,
+      "learning_rate": 1.0155390412665528e-07,
+      "loss": 0.15885504484176635,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.5783108472824097,
+      "learning_rate": 9.38240056443443e-08,
+      "loss": 0.1566369891166687,
+      "step": 2880
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 0.5923556685447693,
+      "learning_rate": 8.639443250736402e-08,
+      "loss": 0.15874269008636474,
+      "step": 2890
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.5997049808502197,
+      "learning_rate": 7.926611203321777e-08,
+      "loss": 0.16008044481277467,
+      "step": 2900
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.6039618253707886,
+      "learning_rate": 7.243993393874882e-08,
+      "loss": 0.16028298139572145,
+      "step": 2910
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.5951699614524841,
+      "learning_rate": 6.591675022908805e-08,
+      "loss": 0.1579365015029907,
+      "step": 2920
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.5689706206321716,
+      "learning_rate": 5.969737509131241e-08,
+      "loss": 0.15579373836517335,
+      "step": 2930
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.6160743236541748,
+      "learning_rate": 5.3782584792823334e-08,
+      "loss": 0.16048473119735718,
+      "step": 2940
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.6045230627059937,
+      "learning_rate": 4.817311758445686e-08,
+      "loss": 0.16096792221069336,
+      "step": 2950
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.5729262232780457,
+      "learning_rate": 4.286967360833866e-08,
+      "loss": 0.16088333129882812,
+      "step": 2960
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 0.5658770203590393,
+      "learning_rate": 3.787291481049754e-08,
+      "loss": 0.16108319759368897,
+      "step": 2970
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.6044265627861023,
+      "learning_rate": 3.3183464858244364e-08,
+      "loss": 0.16624315977096557,
+      "step": 2980
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.5959712862968445,
+      "learning_rate": 2.8801909062328992e-08,
+      "loss": 0.15484490394592285,
+      "step": 2990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.61944979429245,
+      "learning_rate": 2.4728794303886248e-08,
+      "loss": 0.15291309356689453,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.6033022403717041,
+      "learning_rate": 2.0964628966175794e-08,
+      "loss": 0.15846710205078124,
+      "step": 3010
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.5712878704071045,
+      "learning_rate": 1.750988287113009e-08,
+      "loss": 0.15711405277252197,
+      "step": 3020
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.5986247062683105,
+      "learning_rate": 1.4364987220713278e-08,
+      "loss": 0.16440168619155884,
+      "step": 3030
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.6022250652313232,
+      "learning_rate": 1.1530334543099763e-08,
+      "loss": 0.16256611347198485,
+      "step": 3040
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.6291612386703491,
+      "learning_rate": 9.006278643683697e-09,
+      "loss": 0.1630788564682007,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.6009607315063477,
+      "learning_rate": 6.793134560916514e-09,
+      "loss": 0.16497414112091063,
+      "step": 3060
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.5608439445495605,
+      "learning_rate": 4.891178526986451e-09,
+      "loss": 0.1616500735282898,
+      "step": 3070
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.5863735675811768,
+      "learning_rate": 3.3006479333413943e-09,
+      "loss": 0.16401275396347045,
+      "step": 3080
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 0.5719097256660461,
+      "learning_rate": 2.021741301058422e-09,
+      "loss": 0.16051123142242432,
+      "step": 3090
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.5933088064193726,
+      "learning_rate": 1.0546182560652872e-09,
+      "loss": 0.16212167739868164,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 0.5979989171028137,
+      "learning_rate": 3.9939950921774607e-10,
+      "loss": 0.1578517436981201,
+      "step": 3110
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.5368886590003967,
+      "learning_rate": 5.616684123160854e-11,
+      "loss": 0.15727875232696534,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0,
+      "step": 3125,
+      "total_flos": 2.0482865290660545e+19,
+      "train_loss": 0.18024104360580445,
+      "train_runtime": 72884.8323,
+      "train_samples_per_second": 2.744,
+      "train_steps_per_second": 0.043
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0482865290660545e+19,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoints/Qwen3.5-9B-SFT/training_loss.png b/checkpoints/Qwen3.5-9B-SFT/training_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..e62fd4448bf7049b8c64c2f123ed8c859f096ff7
Binary files /dev/null and b/checkpoints/Qwen3.5-9B-SFT/training_loss.png differ
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef37eb6ac95f819a2c4a51919553cc48d3b72354
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,63 @@
+# 实验一发布包
+
+## 目录结构
+
+```
+release_for_upload/
+├── scripts/                    # 评测脚本与验证集
+│   ├── eval_vln_vllm.py        # 主评测脚本 (vLLM)
+│   ├── eval_vln_transformers.py
+│   ├── patch_qwen35_visual_keys.py
+│   ├── patch_gemma_checkpoint.py
+│   ├── eval_exp1_base_parallel.sh
+│   ├── exp4_strict_offline_analysis.py
+│   └── exp1_val_1160.jsonl    # 验证集 1160 样本
+├── checkpoints/                # 8 个 SFT 模型（仅推理所需文件）
+│   ├── Qwen3.5-0.8B-SFT/
+│   ├── Qwen3.5-2B-SFT/
+│   ├── Qwen3.5-9B-SFT/
+│   ├── Qwen3-VL-2B-SFT/
+│   ├── Qwen3-VL-8B-SFT/
+│   ├── InternVL3.5-8B-SFT/
+│   ├── GLM-4.6V-Flash-SFT/
+│   └── Gemma-4-E4B-it-SFT/
+└── README.md
+```
+
+每个模型目录已剔除 DeepSpeed 中间 checkpoint（`checkpoint-*/` + `global_step*/`），仅保留推理所需的：
+- `model.safetensors`
+- `config.json` / `generation_config.json` / `processor_config.json` / `preprocessor_config.json`
+- `chat_template.jinja`
+- `tokenizer.json` / `tokenizer_config.json` / `vocab.json` / `merges.txt`（如有）
+- `eval_results_*.json`（评测结果）
+
+## 模型大小（8 个总计 ≈ 95 GB）
+
+| 模型 | 参数量 | 大小 |
+|---|---:|---:|
+| Qwen3.5-0.8B-SFT | 0.8B | 2.1 G |
+| Qwen3.5-2B-SFT | 2B | 5.1 G |
+| Qwen3.5-9B-SFT | 9B | 18.0 G |
+| Qwen3-VL-2B-SFT | 2B | 4.0 G |
+| Qwen3-VL-8B-SFT | 8B | 16.3 G |
+| InternVL3.5-8B-SFT | 8B | 15.9 G |
+| GLM-4.6V-Flash-SFT | 9B | 19.2 G |
+| Gemma-4-E4B-it-SFT | 4B | 14.9 G |
+
+## 使用方式
+
+```bash
+# 1) 解压（如果文件后缀是 .zip.txt，先把 .txt 去掉再解压）
+unzip Qwen3.5-9B-SFT.zip
+
+# 2) 评测
+source /mnt/.../miniconda3/etc/profile.d/conda.sh && conda activate vllm_eval
+export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+CUDA_VISIBLE_DEVICES=0 python3 scripts/eval_vln_vllm.py \
+    --model_path checkpoints/Qwen3.5-9B-SFT \
+    --val_path scripts/exp1_val_1160.jsonl \
+    --output_dir checkpoints/Qwen3.5-9B-SFT \
+    --gpu_memory_utilization 0.7 \
+    --batch_size 32 \
+    --save_raw
+```
diff --git a/scripts/eval_exp1_base_parallel.sh b/scripts/eval_exp1_base_parallel.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0907e5905a30526cb3ada2cb3c8d0bed38ec5293
--- /dev/null
+++ b/scripts/eval_exp1_base_parallel.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# =============================================================================
+# Parallel evaluation of 8 BASE (untrained) models on exp1 validation set.
+# Each model runs on a dedicated GPU. InternVL uses transformers; rest use vLLM.
+#
+# Layout:
+#   GPU 0: Qwen3.5-0.8B      (vLLM)
+#   GPU 1: Qwen3.5-2B        (vLLM)
+#   GPU 2: Qwen3.5-9B        (vLLM)
+#   GPU 3: Qwen3-VL-2B       (vLLM)
+#   GPU 4: Qwen3-VL-8B       (vLLM)
+#   GPU 5: GLM-4.6V-Flash    (vLLM)
+#   GPU 6: InternVL3.5-8B-HF (transformers; incompatible with vLLM)
+#   GPU 7: Gemma-4-E4B-it    (vLLM)
+#
+# Output: ${EXP_DIR}/output/base_eval_<timestamp>/<name>/eval_results_<model>.json
+#         ${EXP_DIR}/output/base_eval_<timestamp>/<name>/raw_errors_<model>.json
+# =============================================================================
+set -uo pipefail
+
+PROJECT_DIR="/mnt/sfs_turbo_new/R11181/project_vlm"
+EXP_DIR="${PROJECT_DIR}/exp_v5"
+LOG_DIR="${EXP_DIR}/logs"
+mkdir -p "${LOG_DIR}"
+CONDA_ROOT="/mnt/sfs_turbo/R11181/miniconda3"
+MODEL_DIR="${PROJECT_DIR}/model"
+VAL_JSONL="${EXP_DIR}/data/exp1/exp1_val_1160.jsonl"
+
+ts="$(date +%Y%m%d_%H%M%S)"
+
+# Each base model gets a dedicated output subdir to keep results separate
+BASE_OUT_ROOT="${EXP_DIR}/output/base_eval_${ts}"
+mkdir -p "${BASE_OUT_ROOT}"
+
+# helper for vLLM eval
+run_vllm() {
+    local gpu="$1" model_path="$2" name="$3" gmu="$4"
+    local outdir="${BASE_OUT_ROOT}/${name}"
+    local log="${LOG_DIR}/eval_base_${name}_${ts}.log"
+    mkdir -p "${outdir}"
+    {
+        source "${CONDA_ROOT}/etc/profile.d/conda.sh"
+        conda activate vllm_eval
+        export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+        export PYTHONUNBUFFERED=1
+        echo "[$(date '+%F %T')] ${name} (BASE) eval on GPU ${gpu} (vLLM, gmu=${gmu})"
+        CUDA_VISIBLE_DEVICES="${gpu}" python3 "${PROJECT_DIR}/eval_vln_vllm.py" \
+            --model_path "${model_path}" \
+            --val_path "${VAL_JSONL}" \
+            --output_dir "${outdir}" \
+            --gpu_memory_utilization "${gmu}" \
+            --batch_size 32 \
+            --save_raw
+        echo "[$(date '+%F %T')] ${name} DONE"
+    } > "${log}" 2>&1 &
+    disown
+    echo "[OK] ${name} (vLLM) on GPU ${gpu}, pid=$!"
+}
+
+# Launch one transformers evaluation lane in the background (InternVL only).
+# Globals:   BASE_OUT_ROOT, LOG_DIR, CONDA_ROOT, PROJECT_DIR, VAL_JSONL, ts (read)
+# Arguments: $1 - GPU index, exported to the lane as CUDA_VISIBLE_DEVICES
+#            $2 - path of the model to evaluate
+#            $3 - lane name; names the output subdir and the log file
+# Outputs:   prints "[OK] ..." with the background PID to stdout; everything
+#            the lane prints goes to ${LOG_DIR}/eval_base_<name>_<ts>.log
+run_transformers() {
+    local gpu="$1" model_path="$2" name="$3"
+    local outdir="${BASE_OUT_ROOT}/${name}"
+    local log="${LOG_DIR}/eval_base_${name}_${ts}.log"
+    local pid rc
+    mkdir -p "${outdir}"
+    # The group runs as a background subshell, so the conda activation, the
+    # export and the `exit` calls below affect only this lane.
+    {
+        source "${CONDA_ROOT}/etc/profile.d/conda.sh"
+        # Do not fall through to python3 in the wrong environment. A missing
+        # conda.sh is caught here too, because `conda` is then not defined.
+        if ! conda activate vlm_train; then
+            echo "[$(date '+%F %T')] ${name} FAILED: could not activate conda env 'vlm_train'"
+            exit 1
+        fi
+        export PYTHONUNBUFFERED=1
+        echo "[$(date '+%F %T')] ${name} (BASE) eval on GPU ${gpu} (transformers)"
+        CUDA_VISIBLE_DEVICES="${gpu}" python3 "${PROJECT_DIR}/eval_vln_transformers.py" \
+            --model_path "${model_path}" \
+            --val_path "${VAL_JSONL}" \
+            --output_dir "${outdir}" \
+            --model_type internvl
+        rc=$?
+        # Report the real outcome: the log must not say DONE after a crash.
+        if (( rc == 0 )); then
+            echo "[$(date '+%F %T')] ${name} DONE"
+        else
+            echo "[$(date '+%F %T')] ${name} FAILED (exit code ${rc})"
+        fi
+        exit "${rc}"
+    } > "${log}" 2>&1 &
+    pid=$!
+    # Drop the job from this shell's job table.
+    disown
+    echo "[OK] ${name} (transformers) on GPU ${gpu}, pid=${pid}"
+}
+
+# -----------------------------------------------------------------------------
+# 8 lanes
+# -----------------------------------------------------------------------------
+run_vllm 0 "${MODEL_DIR}/Qwen3.5-0.8B"            "Qwen3.5-0.8B-base"      0.7
+run_vllm 1 "${MODEL_DIR}/Qwen3.5-2B"              "Qwen3.5-2B-base"        0.7
+run_vllm 2 "${MODEL_DIR}/Qwen3.5-9B"              "Qwen3.5-9B-base"        0.7
+run_vllm 3 "${MODEL_DIR}/Qwen3-VL-2B-Instruct"    "Qwen3-VL-2B-base"       0.7
+run_vllm 4 "${MODEL_DIR}/Qwen3-VL-8B-Instruct"    "Qwen3-VL-8B-base"       0.7
+run_vllm 5 "${MODEL_DIR}/GLM-4.6V-Flash"          "GLM-4.6V-Flash-base"    0.7
+run_transformers 6 "${MODEL_DIR}/InternVL3_5-8B-HF" "InternVL3.5-8B-base"
+run_vllm 7 "${MODEL_DIR}/Gemma-4-E4B-it"          "Gemma-4-E4B-base"       0.7
+
+echo ""
+echo "============================================================"
+echo "8 base eval lanes launched."
+echo "Results dir: ${BASE_OUT_ROOT}"
+echo "Logs:       ${LOG_DIR}/eval_base_*_${ts}.log"
+echo "============================================================"
diff --git a/scripts/eval_vln_transformers.py b/scripts/eval_vln_transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c9b9a0e3877b118ab3441348fc89570f56f9938
--- /dev/null
+++ b/scripts/eval_vln_transformers.py
@@ -0,0 +1,226 @@
+"""
+VLN Waypoint Prediction Evaluation — transformers fallback version
+For models incompatible with vLLM (e.g., InternVL converted from ModelScope).
+"""
+
+import argparse
+import json
+import os
+import re
+import time
+import logging
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+from PIL import Image
+
+logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+DIMS = ["dx", "dy", "dz", "dpitch", "dyaw", "droll"]
+NUM_WAYPOINTS = 5
+
+
+def load_val_data(val_path: str) -> List[Dict]:
+    """Load a JSONL validation file: one JSON object per non-blank line, in file order."""
+    with open(val_path, encoding="utf-8") as f:
+        # Skip blank/whitespace-only lines; json.loads("") would otherwise raise.
+        data = [json.loads(line) for line in f if line.strip()]
+    logger.info(f"Loaded {len(data)} validation samples")
+    return data
+
+
+def parse_waypoints(text: str) -> Optional[List[Dict]]:
+    """Parse "waypoint_deltas" out of a JSON reply into a list of {dim: float} dicts.
+    Returns None if no JSON object/deltas are found or a delta is malformed or non-finite."""
+    try:
+        text = text.split("</think>")[-1]  # keep only what follows the last </think>
+        match = re.search(r'\{.*\}', text, re.DOTALL)
+        if not match:
+            return None
+        deltas = json.loads(match.group()).get("waypoint_deltas", [])
+        if len(deltas) == 0:
+            return None
+        result = []
+        for d in deltas:
+            if isinstance(d, dict):
+                wp = {dim: float(d.get(dim, 0.0)) for dim in DIMS}  # missing keys -> 0.0
+            elif isinstance(d, (list, tuple)) and len(d) >= len(DIMS):
+                wp = {dim: float(d[i]) for i, dim in enumerate(DIMS)}
+            else:
+                return None
+            # json.loads accepts NaN/Infinity; one such value would poison every mean.
+            if not all(np.isfinite(v) for v in wp.values()):
+                return None
+            result.append(wp)
+        return result
+    except (json.JSONDecodeError, ValueError, TypeError, AttributeError, IndexError):
+        return None
+
+
+# reuse compute_metrics and print_results from eval_vln_vllm
+from eval_vln_vllm import compute_metrics, print_results
+
+
+def load_internvl_model(model_path, device="cuda"):
+    """Load processor + image-text-to-text model; returns (model, tokenizer, processor).
+
+    The tokenizer returned is ``processor.tokenizer``. ``device`` is not read here;
+    placement is left to ``device_map="auto"``.
+    """
+    from transformers import AutoModelForImageTextToText, AutoTokenizer, AutoProcessor
+    logger.info(f"Loading InternVL model from {model_path}")
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+    tokenizer = processor.tokenizer
+    net = AutoModelForImageTextToText.from_pretrained(
+        model_path, dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
+    )
+    net.eval()  # inference only
+    return net, tokenizer, processor
+
+
+def load_generic_model(model_path, device="cuda"):
+    """Load a causal-LM checkpoint; returns (model, tokenizer, processor_or_None).
+
+    ``device`` is not read here; placement is left to ``device_map="auto"``.
+    """
+    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
+    logger.info(f"Loading model from {model_path}")
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    try:
+        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+    except Exception as e:
+        # Best-effort: keep going without a processor, but say why it is missing.
+        logger.warning(f"No processor loaded for {model_path}: {e}")
+        processor = None
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
+    ).eval()
+    return model, tokenizer, processor
+
+
+def internvl_generate(model, tokenizer, processor, item, max_new_tokens=512):
+    """Greedy-decode the assistant reply for one validation item.
+
+    Images listed in ``item["images"]`` are loaded as RGB; every message before
+    the first assistant turn forms the prompt. Returns only the newly generated
+    text (prompt tokens sliced off, special tokens skipped).
+    """
+    turns = item["messages"]
+    pil_images = [Image.open(path).convert("RGB") for path in item.get("images", [])]
+
+    # The dataset's user text already carries one "<image>\n" per image, so the
+    # content stays a plain string; adding {"type": "image"} entries as well
+    # would double-count placeholders.
+    prompt_turns = []
+    for turn in turns:
+        if turn["role"] == "assistant":
+            break
+        prompt_turns.append({"role": turn["role"], "content": turn["content"]})
+
+    # Render the chat template to a string that still contains "<image>".
+    prompt = processor.apply_chat_template(
+        prompt_turns, add_generation_prompt=True, tokenize=False
+    )
+
+    if pil_images:
+        # The processor expects its own image token (default "<IMG_CONTEXT>"),
+        # not "<image>"; swap them 1:1 so each placeholder can be expanded to
+        # match the corresponding pixel_values slice.
+        prompt = prompt.replace("<image>", getattr(processor, "image_token", "<IMG_CONTEXT>"))
+    model_inputs = processor(
+        text=prompt,
+        images=pil_images or None,
+        return_tensors="pt",
+    ).to(model.device)
+
+    with torch.no_grad():
+        generated = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=False)
+
+    prompt_len = model_inputs["input_ids"].shape[-1]
+    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
+
+
+def main():
+    """CLI entry point: greedy-generate over the validation set, score waypoints, save metrics."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True)
+    parser.add_argument("--val_path", type=str,
+                        default="/mnt/data-a808/R26112/datasets/0318_vln_waypoint_val.jsonl")
+    parser.add_argument("--max_samples", type=int, default=None)
+    parser.add_argument("--output_dir", type=str, default=None)
+    parser.add_argument("--model_type", type=str, default="internvl",
+                        choices=["internvl", "generic"])
+    args = parser.parse_args()
+
+    model_name = os.path.basename(args.model_path.rstrip("/"))
+    if args.output_dir is None:
+        args.output_dir = os.path.dirname(args.model_path.rstrip("/"))
+    val_data = load_val_data(args.val_path)
+    if args.max_samples and args.max_samples < len(val_data):
+        val_data = val_data[:args.max_samples]
+
+    if args.model_type == "internvl":
+        model, tokenizer, processor = load_internvl_model(args.model_path)
+    else:
+        model, tokenizer, processor = load_generic_model(args.model_path)
+    if processor is None:
+        # internvl_generate() needs the processor; without it every sample would fail.
+        raise RuntimeError(f"No processor available for {args.model_path}; cannot evaluate")
+
+    total = len(val_data)
+    all_errors = []
+    parse_failures = 0  # also counts generation errors, as compute_metrics expects one number
+    gen_errors = 0
+    for idx, item in enumerate(val_data):
+        gt_turns = [m for m in item["messages"] if m["role"] == "assistant"]
+        gt_wp = parse_waypoints(gt_turns[0]["content"]) if gt_turns else None
+        if gt_wp is None:
+            logger.warning(f"Sample {idx}: cannot parse ground truth, skipping")
+            continue
+
+        try:
+            response = internvl_generate(model, tokenizer, processor, item)
+        except Exception as e:
+            # Throttle on errors seen (not sample index) so late failures are still reported.
+            gen_errors += 1
+            if gen_errors <= 8:
+                logger.warning(f"Sample {idx}: generation error: {e}")
+            if gen_errors <= 3:
+                import traceback
+                logger.warning(traceback.format_exc())
+            parse_failures += 1
+            continue
+
+        pred_wp = parse_waypoints(response)
+        if pred_wp is None:
+            parse_failures += 1
+            if parse_failures <= 5 or parse_failures % 100 == 0:
+                logger.warning(f"Sample {idx}: parse failure. Output: {response[:200]}")
+            continue
+
+        n_wp = min(len(gt_wp), len(pred_wp))
+        sample_errors = {dim: [] for dim in DIMS}
+        for wi in range(n_wp):
+            for dim in DIMS:
+                err = abs(pred_wp[wi][dim] - gt_wp[wi][dim])
+                sample_errors[dim].append(err)
+        all_errors.append(sample_errors)
+
+        if (idx + 1) % 50 == 0:  # all_errors is never empty here (just appended)
+            cur_mae = np.mean([np.mean([e for s in all_errors for e in s[dim]]) for dim in DIMS])
+            logger.info(f"Progress [{idx+1}/{total}] MAE: {cur_mae:.4f} | parse_fail={parse_failures}")
+
+    results = compute_metrics(all_errors, parse_failures, total)
+    results["inference_engine"] = "transformers"
+    print_results(results, model_name)
+    os.makedirs(args.output_dir, exist_ok=True)
+    out_file = os.path.join(args.output_dir, f"eval_results_{model_name}.json")
+    with open(out_file, "w") as f:
+        json.dump(results, f, indent=2)
+    logger.info(f"Results saved to {out_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/eval_vln_vllm.py b/scripts/eval_vln_vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfe54a1882bbeec176f35bc697a5812a982fa6c3
--- /dev/null
+++ b/scripts/eval_vln_vllm.py
@@ -0,0 +1,506 @@
+"""
+VLN Waypoint Prediction Evaluation — vLLM accelerated version
+Uses vLLM offline batch inference for much faster evaluation.
+"""
+
+import argparse
+import json
+import os
+import re
+import time
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+from PIL import Image
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+DIMS = ["dx", "dy", "dz", "dpitch", "dyaw", "droll"]
+NUM_WAYPOINTS = 5
+
+
+def load_val_data(val_path: str) -> List[Dict]:
+    """Load a JSONL validation file: one JSON object per non-blank line, in file order."""
+    with open(val_path, encoding="utf-8") as f:
+        # Skip blank/whitespace-only lines (e.g. a stray empty line at the end);
+        # json.loads("") would otherwise raise.
+        data = [json.loads(line) for line in f if line.strip()]
+    logger.info(f"Loaded {len(data)} validation samples")
+    return data
+
+
+def parse_waypoints(text: str) -> Optional[List[Dict]]:
+    """Parse "waypoint_deltas" out of a JSON reply into a list of {dim: float} dicts.
+    Each delta is a dict keyed by DIMS or a sequence of at least len(DIMS) numbers.
+    Returns None if no JSON object/deltas are found or a delta is malformed or non-finite."""
+    try:
+        text = text.split("</think>")[-1]  # keep only what follows the last </think>
+        match = re.search(r'\{.*\}', text, re.DOTALL)
+        if not match:
+            return None
+        deltas = json.loads(match.group()).get("waypoint_deltas", [])
+        if len(deltas) == 0:
+            return None
+        result = []
+        for d in deltas:
+            if isinstance(d, dict):
+                wp = {dim: float(d.get(dim, 0.0)) for dim in DIMS}  # missing keys -> 0.0
+            elif isinstance(d, (list, tuple)) and len(d) >= len(DIMS):
+                wp = {dim: float(d[i]) for i, dim in enumerate(DIMS)}
+            else:
+                return None
+            # json.loads accepts NaN/Infinity; one such value would poison every mean.
+            if not all(np.isfinite(v) for v in wp.values()):
+                return None
+            result.append(wp)
+        return result
+    except (json.JSONDecodeError, ValueError, TypeError, AttributeError, IndexError):
+        return None
+
+
+def build_vllm_inputs(item: Dict) -> List[Dict]:
+    """Build the chat-message list passed to ``llm.chat`` for one validation item.
+
+    Keeps every message before the first assistant turn. Each user turn gets the
+    item's images (as PIL "image_pil" parts) followed by its original text.
+    """
+    image_paths = item.get("images", [])
+    chat_messages = []
+    for msg in item["messages"]:
+        if msg["role"] == "assistant":
+            break
+        if msg["role"] != "user" or not image_paths:
+            chat_messages.append({"role": msg["role"], "content": msg["content"]})
+            continue
+        # NOTE(review): the user text may already contain literal "<image>" markers
+        # (the transformers evaluator relies on them) - confirm they are not double-counted.
+        content_parts = []
+        for p in image_paths:
+            try:
+                pil_img = Image.open(p).convert("RGB")
+            except Exception as _e:
+                # Best-effort: the sample is still sent, just with fewer images.
+                logger.warning(f"failed to open image {p}: {_e}")
+                continue
+            content_parts.append({"type": "image_pil", "image_pil": pil_img})
+        content_parts.append({"type": "text", "text": msg["content"]})
+        chat_messages.append({"role": "user", "content": content_parts})
+    return chat_messages
+
+
+def compute_metrics(all_errors, parse_failures, total):
+    """Aggregate per-sample absolute errors into a flat dict of metrics.
+
+    all_errors: one dict per scored sample, DIMS name -> list of per-waypoint abs errors.
+    parse_failures / total: only feed the parse-rate and count fields at the end."""
+    metrics = {}
+    for dim in DIMS:
+        vals = [e for s in all_errors for e in s[dim]]
+        if vals:
+            metrics[f"mae_{dim}"] = float(np.mean(vals))
+            metrics[f"rmse_{dim}"] = float(np.sqrt(np.mean(np.array(vals) ** 2)))
+
+    all_vals = []
+    for dim in DIMS:
+        all_vals.extend([e for s in all_errors for e in s[dim]])
+    pos_dims = ["dx", "dy", "dz"]
+    pos_vals = []
+    for dim in pos_dims:
+        pos_vals.extend([e for s in all_errors for e in s[dim]])
+    rot_dims = ["dpitch", "dyaw", "droll"]
+    rot_vals = []
+    for dim in rot_dims:
+        rot_vals.extend([e for s in all_errors for e in s[dim]])
+
+    metrics["mae_overall"] = float(np.mean(all_vals)) if all_vals else 0
+    metrics["mae_position"] = float(np.mean(pos_vals)) if pos_vals else 0
+    metrics["mae_rotation"] = float(np.mean(rot_vals)) if rot_vals else 0
+    metrics["rmse_overall"] = float(np.sqrt(np.mean(np.array(all_vals) ** 2))) if all_vals else 0
+
+    per_wp_euc = {}
+    for s in all_errors:
+        n_wp = len(s["dx"])
+        for wi in range(n_wp):
+            euc = np.sqrt(s["dx"][wi]**2 + s["dy"][wi]**2 + s["dz"][wi]**2)
+            per_wp_euc.setdefault(wi, []).append(euc)
+    all_euc = []
+    for wi in sorted(per_wp_euc.keys()):
+        vals = per_wp_euc[wi]
+        all_euc.extend(vals)
+        metrics[f"wp{wi+1}_euc_mae"] = float(np.mean(vals))
+        metrics[f"wp{wi+1}_euc_median"] = float(np.median(vals))
+
+    metrics["euclidean_mae"] = float(np.mean(all_euc)) if all_euc else 0
+
+    ade_list, fde_list = [], []
+    for s in all_errors:
+        n_wp = len(s["dx"])
+        traj_eucs = []
+        for wi in range(n_wp):
+            euc = np.sqrt(s["dx"][wi]**2 + s["dy"][wi]**2 + s["dz"][wi]**2)
+            traj_eucs.append(euc)
+        if traj_eucs:
+            ade_list.append(np.mean(traj_eucs))
+            fde_list.append(traj_eucs[-1])
+
+    metrics["ADE"] = float(np.mean(ade_list)) if ade_list else 0
+    metrics["FDE"] = float(np.mean(fde_list)) if fde_list else 0
+    metrics["ADE_median"] = float(np.median(ade_list)) if ade_list else 0
+    metrics["FDE_median"] = float(np.median(fde_list)) if fde_list else 0
+
+    # ----- finer-grained position thresholds (down to 0.1m) -----
+    pos_thresholds = [0.1, 0.2, 0.3, 0.5, 1.0, 2.0, 5.0]
+    for thr in pos_thresholds:
+        hit = sum(1 for e in all_euc if e < thr)
+        metrics[f"SR@{thr}m"] = hit / len(all_euc) if all_euc else 0
+
+    # ----- finer-grained trajectory thresholds (all wps under threshold) -----
+    traj_thresholds = [0.3, 0.5, 1.0, 2.0, 5.0]
+    for thr in traj_thresholds:
+        traj_success = 0
+        for s in all_errors:
+            n_wp = len(s["dx"])
+            all_under = True
+            for wi in range(n_wp):
+                euc = np.sqrt(s["dx"][wi]**2 + s["dy"][wi]**2 + s["dz"][wi]**2)
+                if euc >= thr:
+                    all_under = False
+                    break
+            if all_under:
+                traj_success += 1
+        metrics[f"TrajSR@{thr}m"] = traj_success / len(all_errors) if all_errors else 0
+
+    # ----- per-sample rotation magnitudes (waypoint-level) -----
+    all_rot_errors = []
+    per_sample_rot_mags = []  # list[list[float]]: each sample's per-wp rot magnitude
+    per_sample_pos_mags = []  # list[list[float]]: each sample's per-wp euc dist
+    for s in all_errors:
+        rots = []
+        poss = []
+        for wi in range(len(s["dx"])):
+            rot_err = np.sqrt(s["dpitch"][wi]**2 + s["dyaw"][wi]**2 + s["droll"][wi]**2)
+            pos_err = np.sqrt(s["dx"][wi]**2 + s["dy"][wi]**2 + s["dz"][wi]**2)
+            all_rot_errors.append(rot_err)
+            rots.append(rot_err)
+            poss.append(pos_err)
+        per_sample_rot_mags.append(rots)
+        per_sample_pos_mags.append(poss)
+
+    # ----- finer-grained rotation thresholds (down to 0.5deg) -----
+    rot_thresholds = [0.5, 1.0, 2.0, 5.0, 10.0]
+    for thr in rot_thresholds:
+        hit = sum(1 for e in all_rot_errors if e < thr)
+        metrics[f"RotAcc@{thr}deg"] = hit / len(all_rot_errors) if all_rot_errors else 0
+
+    # ----- TrajRotSR: whole trajectory all wps under rot threshold -----
+    for thr in [1.0, 2.0, 5.0, 10.0]:
+        traj_rot_success = 0
+        for rots in per_sample_rot_mags:
+            if all(r < thr for r in rots):
+                traj_rot_success += 1
+        metrics[f"TrajRotSR@{thr}deg"] = traj_rot_success / len(per_sample_rot_mags) if per_sample_rot_mags else 0
+
+    # ----- Joint success rates (position and rotation checked on the same waypoint) -----
+    # JointSR@(pos_thr, rot_thr): share of samples where AT LEAST ONE waypoint meets BOTH limits
+    JOINT_PAIRS = [(0.5, 1.0), (0.5, 5.0), (1.0, 1.0), (1.0, 5.0), (0.3, 1.0), (0.5, 2.0)]
+    for pos_thr, rot_thr in JOINT_PAIRS:
+        hit = 0
+        for poss, rots in zip(per_sample_pos_mags, per_sample_rot_mags):
+            if any(p < pos_thr and r < rot_thr for p, r in zip(poss, rots)):
+                hit += 1
+        metrics[f"JointSR@({pos_thr}m,{rot_thr}deg)"] = hit / len(per_sample_pos_mags) if per_sample_pos_mags else 0
+
+    # TrajJointSR: share of samples where EVERY waypoint meets both limits
+    for pos_thr, rot_thr in JOINT_PAIRS:
+        hit = 0
+        for poss, rots in zip(per_sample_pos_mags, per_sample_rot_mags):
+            if all(p < pos_thr and r < rot_thr for p, r in zip(poss, rots)):
+                hit += 1
+        metrics[f"TrajJointSR@({pos_thr}m,{rot_thr}deg)"] = hit / len(per_sample_pos_mags) if per_sample_pos_mags else 0
+
+    # ----- Per-waypoint rotation MAE (kept) -----
+    per_wp_rot = {}
+    for s in all_errors:
+        n_wp = len(s["dx"])
+        for wi in range(n_wp):
+            rot_err = np.sqrt(s["dpitch"][wi]**2 + s["dyaw"][wi]**2 + s["droll"][wi]**2)
+            per_wp_rot.setdefault(wi, []).append(rot_err)
+
+    for wi in sorted(per_wp_rot.keys()):
+        vals = per_wp_rot[wi]
+        metrics[f"wp{wi+1}_rot_mae"] = float(np.mean(vals))
+
+    metrics["rotation_euc_mae"] = float(np.mean(all_rot_errors)) if all_rot_errors else 0
+
+    # ----- Percentile / tail metrics (Sample-level, robust to outliers) -----
+    if ade_list:
+        ade_arr = np.array(ade_list)
+        for p in [50, 75, 90, 95, 99]:
+            metrics[f"ADE_p{p}"] = float(np.percentile(ade_arr, p))
+        metrics["ADE_max"] = float(ade_arr.max())
+    if fde_list:
+        fde_arr = np.array(fde_list)
+        for p in [50, 75, 90, 95, 99]:
+            metrics[f"FDE_p{p}"] = float(np.percentile(fde_arr, p))
+        metrics["FDE_max"] = float(fde_arr.max())
+    if all_rot_errors:
+        rot_arr = np.array(all_rot_errors)
+        for p in [50, 75, 90, 95, 99]:
+            metrics[f"rot_err_p{p}"] = float(np.percentile(rot_arr, p))
+        metrics["rot_err_max"] = float(rot_arr.max())
+
+    # ----- Hard failure rates: position from FDE, rotation from each sample's worst waypoint -----
+    n_samples = len(all_errors)
+    if n_samples > 0:
+        for thr in [2.0, 5.0, 10.0]:
+            metrics[f"HardFailRate_pos_gt_{thr}m"] = sum(1 for e in fde_list if e > thr) / n_samples
+        sample_max_rot = [max(rots) if rots else 0 for rots in per_sample_rot_mags]
+        for thr in [10.0, 30.0, 60.0]:
+            metrics[f"HardFailRate_rot_gt_{thr}deg"] = sum(1 for r in sample_max_rot if r > thr) / n_samples
+
+    metrics["parse_failure_rate"] = parse_failures / total if total > 0 else 0
+    metrics["parse_success_rate"] = 1 - metrics["parse_failure_rate"]
+    metrics["valid_samples"] = len(all_errors)
+    metrics["total_samples"] = total
+    metrics["parse_failures"] = parse_failures
+
+    return metrics
+
+
+def print_results(results, model_name):
+    """Log a human-readable summary of a metrics dict (as built by compute_metrics) via logger.info."""
+    logger.info("=" * 70)
+    logger.info(f"  Evaluation Results: {model_name}")
+    logger.info("=" * 70)
+    logger.info(f"  Samples: {results['valid_samples']}/{results['total_samples']} "
+                f"(parse failures: {results['parse_failures']}, "
+                f"rate: {results['parse_failure_rate']:.2%}, "
+                f"success: {results['parse_success_rate']:.2%})")
+    logger.info("-" * 70)
+    logger.info("  [Regression Metrics]")
+    logger.info(f"  Overall MAE:     {results['mae_overall']:.4f}")
+    logger.info(f"  Position MAE:    {results['mae_position']:.4f}  (dx/dy/dz)")
+    logger.info(f"  Rotation MAE:    {results['mae_rotation']:.4f}  (dpitch/dyaw/droll)")
+    logger.info(f"  Overall RMSE:    {results['rmse_overall']:.4f}")
+
+    logger.info("-" * 70)
+    logger.info("  [Trajectory Metrics]")
+    logger.info(f"  ADE (mean):      {results['ADE']:.4f}   (avg displacement error)")
+    logger.info(f"  ADE (median):    {results['ADE_median']:.4f}")
+    logger.info(f"  FDE (mean):      {results['FDE']:.4f}   (final displacement error)")
+    logger.info(f"  FDE (median):    {results['FDE_median']:.4f}")
+    logger.info(f"  Euclidean MAE:   {results['euclidean_mae']:.4f}")
+
+    logger.info("-" * 70)
+    logger.info("  [Position Success Rate]")
+    for thr in [0.5, 1.0, 2.0, 5.0]:  # only a subset of the thresholds is printed; missing keys show as 0
+        key = f"SR@{thr}m"
+        logger.info(f"  {key:12s}  {results.get(key, 0):.2%}")
+
+    logger.info("-" * 70)
+    logger.info("  [Trajectory Success Rate (all waypoints under threshold)]")
+    for thr in [1.0, 2.0, 5.0]:
+        key = f"TrajSR@{thr}m"
+        logger.info(f"  {key:14s}  {results.get(key, 0):.2%}")
+
+    logger.info("-" * 70)
+    logger.info("  [Rotation Accuracy]")
+    for thr in [1.0, 5.0, 10.0]:
+        key = f"RotAcc@{thr}deg"
+        logger.info(f"  {key:16s}  {results.get(key, 0):.2%}")
+
+    logger.info("-" * 70)
+    logger.info("  [Per-waypoint Position Error (Euclidean)]")
+    for wi in range(NUM_WAYPOINTS):  # waypoints beyond NUM_WAYPOINTS are never printed
+        euc_key = f"wp{wi+1}_euc_mae"
+        med_key = f"wp{wi+1}_euc_median"
+        if euc_key in results:
+            logger.info(f"  Waypoint {wi+1}:  MAE={results[euc_key]:.4f}  "
+                         f"Median={results.get(med_key, 0):.4f}")
+
+    logger.info("-" * 70)
+    logger.info("  [Per-waypoint Rotation Error]")
+    for wi in range(NUM_WAYPOINTS):
+        rot_key = f"wp{wi+1}_rot_mae"
+        if rot_key in results:
+            logger.info(f"  Waypoint {wi+1}:  MAE={results[rot_key]:.4f}")
+
+    logger.info("-" * 70)
+    logger.info("  [Per-dimension MAE / RMSE]")
+    for dim in DIMS:
+        logger.info(f"  {dim:8s}  MAE={results.get(f'mae_{dim}',0):.4f}  "
+                     f"RMSE={results.get(f'rmse_{dim}',0):.4f}")
+    logger.info("=" * 70)
+
+
+def get_max_model_len(model_path: str) -> int:
+    """Return config.json's top-level max_position_embeddings capped at 8192 (8192 if absent)."""
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return 8192
+    with open(config_path) as f:
+        cfg = json.load(f)
+    return min(int(cfg.get("max_position_embeddings", 8192)), 8192)
+
+
+def main():
+    """CLI entry point: batch-generate with vLLM over the validation set, score waypoints, save metrics."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True)
+    parser.add_argument("--val_path", type=str,
+                        default="/mnt/data-a808/R26112/datasets/0318_vln_waypoint_val.jsonl")
+    parser.add_argument("--max_samples", type=int, default=None)
+    parser.add_argument("--output_dir", type=str, default=None)
+    parser.add_argument("--tensor_parallel_size", type=int, default=1)
+    parser.add_argument("--batch_size", type=int, default=64,
+                        help="Number of requests per vLLM batch call")
+    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85)
+    parser.add_argument("--max_model_len", type=int, default=None)
+    parser.add_argument("--save_raw", action="store_true",
+                        help="If set, also save per-sample raw errors to "
+                             "raw_errors_<model_name>.json (enables strict offline analysis).")
+    args = parser.parse_args()
+
+    model_name = os.path.basename(args.model_path.rstrip("/"))
+    if args.output_dir is None:
+        args.output_dir = os.path.dirname(args.model_path.rstrip("/"))
+
+    val_data = load_val_data(args.val_path)
+    if args.max_samples and args.max_samples < len(val_data):
+        val_data = val_data[:args.max_samples]
+
+    from vllm import LLM, SamplingParams, __version__ as vllm_version
+
+    max_model_len = args.max_model_len or get_max_model_len(args.model_path)
+
+    logger.info(f"Loading model with vLLM: {args.model_path}")
+    logger.info(f"  tensor_parallel_size={args.tensor_parallel_size}")
+    logger.info(f"  gpu_memory_utilization={args.gpu_memory_utilization}")
+    logger.info(f"  max_model_len={max_model_len}")
+
+    llm = LLM(
+        model=args.model_path,
+        trust_remote_code=True,
+        tensor_parallel_size=args.tensor_parallel_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        max_model_len=max_model_len,
+        limit_mm_per_prompt={"image": 5},
+        allowed_local_media_path="/",
+    )
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=512,
+    )
+
+    logger.info("Validating samples (parsing ground-truth only; images opened lazily per batch)...")
+    valid_items = []  # [(orig_idx, item, gt_wp), ...]
+
+    for idx, item in enumerate(val_data):
+        gt_turns = [m for m in item["messages"] if m["role"] == "assistant"]
+        gt_wp = parse_waypoints(gt_turns[0]["content"]) if gt_turns else None
+        if gt_wp is None:
+            logger.warning(f"Sample {idx}: cannot parse ground truth, skipping")
+            continue
+        valid_items.append((idx, item, gt_wp))
+
+    logger.info(f"Valid samples: {len(valid_items)}/{len(val_data)}")
+
+    total = len(val_data)
+    all_errors = []
+    parse_failures = 0
+    eval_start = time.time()  # bound before the loop so it exists even with zero valid samples
+    import gc
+    for batch_start in range(0, len(valid_items), args.batch_size):
+        batch_end = min(batch_start + args.batch_size, len(valid_items))
+        batch_items = valid_items[batch_start:batch_end]
+        # build inputs (open PIL images) only for this batch; freed after use
+        batch_msgs = [build_vllm_inputs(it[1]) for it in batch_items]
+        batch_gt = [it[2] for it in batch_items]
+        valid_indices = [it[0] for it in batch_items]
+
+        logger.info(f"Running vLLM batch [{batch_start+1}-{batch_end}/{len(valid_items)}]...")
+        t0 = time.time()
+
+        outputs = llm.chat(
+            messages=batch_msgs,
+            sampling_params=sampling_params,
+            chat_template_kwargs={"enable_thinking": False},
+        )
+
+        elapsed = time.time() - t0
+        logger.info(f"  Batch done in {elapsed:.1f}s ({len(batch_msgs)/max(elapsed, 1e-9):.1f} samples/s)")
+
+        for i, output in enumerate(outputs):
+            generated = output.outputs[0].text
+            pred_wp = parse_waypoints(generated)
+
+            if pred_wp is None:
+                parse_failures += 1
+                sample_idx = valid_indices[i]
+                if parse_failures <= 5 or parse_failures % 50 == 0:
+                    logger.warning(f"Sample {sample_idx}: parse failure. Output: {generated[:200]}")
+                continue
+
+            gt_wp = batch_gt[i]
+            n_wp = min(len(gt_wp), len(pred_wp))
+            sample_errors = {dim: [] for dim in DIMS}
+            for wi in range(n_wp):
+                for dim in DIMS:
+                    err = abs(pred_wp[wi][dim] - gt_wp[wi][dim])
+                    sample_errors[dim].append(err)
+            all_errors.append(sample_errors)
+
+        del batch_msgs
+        gc.collect()
+
+        if all_errors:
+            cur_mae = {}
+            for dim in DIMS:
+                vals = [e for s in all_errors for e in s[dim]]
+                cur_mae[dim] = np.mean(vals) if vals else 0
+            avg = np.mean(list(cur_mae.values()))
+            logger.info(
+                f"  Progress [{batch_end}/{len(valid_items)}] "
+                f"MAE: {avg:.4f} | parse_fail={parse_failures}"
+            )
+
+    results = compute_metrics(all_errors, parse_failures, total)
+
+    logger.info(f"Total inference time: {time.time() - eval_start:.1f}s")
+    results["inference_engine"] = "vllm"
+    results["vllm_version"] = vllm_version  # record the version actually used for this run
+
+    print_results(results, model_name)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    out_file = os.path.join(args.output_dir, f"eval_results_{model_name}.json")
+    with open(out_file, "w") as f:
+        json.dump(results, f, indent=2)
+    logger.info(f"Results saved to {out_file}")
+
+    if args.save_raw:
+        raw_file = os.path.join(args.output_dir, f"raw_errors_{model_name}.json")
+        # Compress: store as flat lists per dim; preserves per-sample / per-wp structure.
+        raw_payload = {
+            "n_samples": len(all_errors),
+            "parse_failures": parse_failures,
+            "total_samples": total,
+            "dims": DIMS,
+            # each sample: dict of dim -> list[float] (one entry per waypoint)
+            "errors_per_sample": [
+                {dim: list(map(float, s[dim])) for dim in DIMS} for s in all_errors
+            ],
+        }
+        with open(raw_file, "w") as f:
+            json.dump(raw_payload, f)
+        logger.info(f"Raw errors saved to {raw_file} ({os.path.getsize(raw_file)/1e6:.2f} MB)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/exp1_val_1160.jsonl b/scripts/exp1_val_1160.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c5c28db3af15963feb444cc5c0e1b96daab2180b
--- /dev/null
+++ b/scripts/exp1_val_1160.jsonl
@@ -0,0 +1,1160 @@
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.39, 62.92, 22.0, -46.48, -92.86, 0.0]\n  Target bbox: [623.38, 330.92, 656.37, 388.28]\n\nFrame 2:\n  Drone pose: [106.51, 61.49, 21.2, -46.78, -90.35, 0.0]\n  Target bbox: [621.87, 330.29, 657.99, 388.81]\n\nFrame 3:\n  Drone pose: [106.16, 60.6, 20.67, -46.6, -89.28, 0.0]\n  Target bbox: [624.66, 330.22, 655.59, 388.89]\n\nFrame 4:\n  Drone pose: [106.05, 59.97, 20.64, -46.76, -88.95, 0.0]\n  Target bbox: [627.33, 330.32, 652.87, 388.8]\n\nFrame 5 (current):\n  Drone pose: [106.03, 59.43, 20.62, -46.79, -88.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.92, \"ymin\": 328.42, \"xmax\": 662.47, \"ymax\": 390.66}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.55, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.05, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.19, \"dyaw\": 0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.99, "window_alt_abs_m": 1.38, "target_px_mean_hist": 555.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.93, 56.86, 20.42, -46.6, -88.57, 0.0]\n  Target bbox: [625.91, 329.41, 654.33, 389.71]\n\nFrame 2:\n  Drone pose: [105.87, 56.34, 20.39, -46.58, -88.38, 0.0]\n  Target bbox: [617.75, 328.14, 662.64, 390.94]\n\nFrame 3:\n  Drone pose: [105.8, 55.83, 20.36, -46.55, -88.16, 0.0]\n  Target bbox: [623.09, 326.12, 656.82, 393.11]\n\nFrame 4:\n  Drone pose: [105.73, 55.32, 20.33, -46.54, -89.5, 0.0]\n  Target bbox: [619.85, 328.57, 660.36, 390.46]\n\nFrame 5 (current):\n  Drone pose: [105.68, 54.8, 20.3, -46.52, -89.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.64, \"ymin\": 328.82, \"xmax\": 656.64, \"ymax\": 390.22}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -1.02, \"dz\": -0.06, \"dpitch\": 0.04, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -1.53, \"dz\": -0.08, \"dpitch\": 0.06, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.05, \"dz\": -0.11, \"dpitch\": 0.08, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -2.57, \"dz\": -0.13, \"dpitch\": 0.08, \"dyaw\": -0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.94, "window_alt_abs_m": 0.12, "target_px_mean_hist": 603.2, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.69, 52.23, 20.17, -46.44, -89.37, 0.0]\n  Target bbox: [617.52, 327.86, 662.76, 391.19]\n\nFrame 2:\n  Drone pose: [105.76, 51.71, 20.15, -46.44, -89.59, 0.0]\n  Target bbox: [621.19, 328.77, 658.99, 390.3]\n\nFrame 3:\n  Drone pose: [105.85, 51.19, 20.13, -46.45, -89.87, 0.0]\n  Target bbox: [624.73, 328.81, 655.32, 390.21]\n\nFrame 4:\n  Drone pose: [105.97, 50.66, 20.12, -46.48, -90.23, 0.0]\n  Target bbox: [622.08, 329.0, 657.82, 390.04]\n\nFrame 5 (current):\n  Drone pose: [106.11, 50.13, 20.1, -46.51, -90.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.29, \"ymin\": 327.16, \"xmax\": 661.4, \"ymax\": 391.84}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": -0.55, \"dz\": -0.01, \"dpitch\": -0.04, \"dyaw\": -0.52, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.09, \"dz\": -0.02, \"dpitch\": -0.09, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": -1.65, \"dz\": -0.03, \"dpitch\": -0.15, \"dyaw\": -1.78, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -2.22, \"dz\": -0.04, \"dpitch\": -0.23, \"dyaw\": -2.51, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -2.79, \"dz\": -0.05, \"dpitch\": -0.31, \"dyaw\": -3.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.3, "window_alt_abs_m": 0.07, "target_px_mean_hist": 605.8, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.13, 47.34, 20.05, -46.82, -93.94, 0.0]\n  Target bbox: [625.27, 327.93, 654.49, 391.1]\n\nFrame 2:\n  Drone pose: [107.36, 46.75, 20.04, -46.92, -94.71, 0.0]\n  Target bbox: [624.64, 327.92, 655.12, 391.08]\n\nFrame 3:\n  Drone pose: [107.58, 46.16, 20.04, -47.02, -95.44, 0.0]\n  Target bbox: [621.2, 327.51, 658.51, 391.55]\n\nFrame 4:\n  Drone pose: [107.77, 45.57, 20.03, -47.13, -96.07, 0.0]\n  Target bbox: [624.13, 327.74, 655.64, 391.28]\n\nFrame 5 (current):\n  Drone pose: [107.9, 44.96, 20.03, -47.27, -96.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.16, \"ymin\": 327.47, \"xmax\": 656.6, \"ymax\": 391.54}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": -0.61, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.23, \"dz\": -0.01, \"dpitch\": -0.25, \"dyaw\": -1.91, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.86, \"dz\": -0.01, \"dpitch\": -0.34, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -2.49, \"dz\": -0.02, \"dpitch\": -0.43, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": -3.12, \"dz\": -0.02, \"dpitch\": -0.52, \"dyaw\": -5.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.58, "window_alt_abs_m": 0.02, "target_px_mean_hist": 616.2, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.6, 41.84, 20.01, -47.79, -102.38, 0.0]\n  Target bbox: [622.51, 324.28, 657.63, 394.82]\n\nFrame 2:\n  Drone pose: [107.41, 41.22, 20.01, -47.87, -103.45, 0.0]\n  Target bbox: [618.81, 322.6, 661.28, 396.51]\n\nFrame 3:\n  Drone pose: [107.18, 40.6, 20.01, -47.95, -104.44, 0.0]\n  Target bbox: [618.6, 322.36, 661.55, 396.82]\n\nFrame 4:\n  Drone pose: [106.94, 39.99, 20.01, -48.02, -105.36, 0.0]\n  Target bbox: [621.18, 323.75, 658.96, 395.34]\n\nFrame 5 (current):\n  Drone pose: [106.67, 39.38, 20.0, -48.08, -106.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.94, \"ymin\": 322.85, \"xmax\": 661.17, \"ymax\": 396.24}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": -0.59, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.8, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -1.18, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -1.54, \"droll\": 0.0}, {\"dx\": -0.88, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -2.22, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -2.32, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -2.86, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -2.88, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -3.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.84, "window_alt_abs_m": 0.01, "target_px_mean_hist": 603.0, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.14, 36.5, 20.0, -48.19, -109.69, 0.0]\n  Target bbox: [618.89, 322.9, 661.3, 396.25]\n\nFrame 2:\n  Drone pose: [104.81, 35.95, 20.0, -48.17, -110.3, 0.0]\n  Target bbox: [619.13, 323.33, 661.04, 395.8]\n\nFrame 3:\n  Drone pose: [104.5, 35.4, 20.0, -48.13, -110.93, 0.0]\n  Target bbox: [617.0, 320.43, 663.29, 398.76]\n\nFrame 4:\n  Drone pose: [104.2, 34.87, 20.0, -48.06, -111.61, 0.0]\n  Target bbox: [623.77, 324.36, 656.43, 394.71]\n\nFrame 5 (current):\n  Drone pose: [103.93, 34.35, 20.0, -47.95, -112.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.73, \"ymin\": 325.87, \"xmax\": 657.12, \"ymax\": 393.21}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": -1.52, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -2.02, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.65, "window_alt_abs_m": 0.0, "target_px_mean_hist": 616.8, "cur_frame_id": 49, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.88, 31.84, 20.0, -47.98, -112.21, 0.0]\n  Target bbox: [615.31, 322.95, 664.54, 396.19]\n\nFrame 2:\n  Drone pose: [102.69, 31.36, 20.0, -48.08, -111.6, 0.0]\n  Target bbox: [619.57, 324.0, 660.6, 395.11]\n\nFrame 3:\n  Drone pose: [102.49, 30.87, 20.0, -47.88, -112.51, 0.0]\n  Target bbox: [617.14, 323.66, 662.71, 395.48]\n\nFrame 4:\n  Drone pose: [102.28, 30.38, 20.0, -48.0, -111.84, 0.0]\n  Target bbox: [623.29, 323.85, 656.93, 395.23]\n\nFrame 5 (current):\n  Drone pose: [102.06, 29.89, 20.0, -47.81, -112.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.93, \"ymin\": 324.7, \"xmax\": 659.92, \"ymax\": 394.4}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.72, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -0.7, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": -2.44, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 612.5, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.92, 27.45, 20.0, -47.8, -112.18, 0.0]\n  Target bbox: [618.44, 322.31, 661.81, 396.9]\n\nFrame 2:\n  Drone pose: [100.71, 26.96, 20.0, -47.6, -113.05, 0.0]\n  Target bbox: [622.16, 325.23, 657.69, 393.81]\n\nFrame 3:\n  Drone pose: [100.5, 26.48, 20.0, -47.71, -112.39, 0.0]\n  Target bbox: [615.94, 323.04, 663.93, 396.08]\n\nFrame 4:\n  Drone pose: [100.28, 25.99, 20.0, -47.82, -111.71, 0.0]\n  Target bbox: [620.26, 323.8, 659.93, 395.33]\n\nFrame 5 (current):\n  Drone pose: [100.06, 25.5, 20.0, -47.63, -112.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.95, \"ymin\": 324.6, \"xmax\": 659.91, \"ymax\": 394.47}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.72, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": -1.47, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": -1.96, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": -2.45, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.04, "window_alt_abs_m": 0.0, "target_px_mean_hist": 620.2, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.84, 23.05, 20.0, -47.7, -111.81, 0.0]\n  Target bbox: [620.76, 325.77, 659.35, 393.34]\n\nFrame 2:\n  Drone pose: [98.6, 22.55, 20.0, -47.5, -112.6, 0.0]\n  Target bbox: [621.54, 326.4, 658.57, 392.7]\n\nFrame 3:\n  Drone pose: [98.37, 22.06, 20.0, -47.11, -113.41, 0.0]\n  Target bbox: [623.11, 326.12, 656.75, 392.99]\n\nFrame 4:\n  Drone pose: [98.16, 21.58, 20.0, -47.23, -112.76, 0.0]\n  Target bbox: [617.13, 323.39, 662.74, 395.71]\n\nFrame 5 (current):\n  Drone pose: [97.95, 21.09, 20.0, -47.33, -112.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.06, \"ymin\": 323.61, \"xmax\": 662.79, \"ymax\": 395.54}, \"waypoint_deltas\": [{\"dx\": -0.21, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": -0.42, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -1.47, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -1.96, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": -2.46, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.9, "window_alt_abs_m": 0.0, "target_px_mean_hist": 624.8, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [96.85, 18.63, 20.0, -47.34, -111.73, 0.0]\n  Target bbox: [617.4, 320.75, 662.89, 398.46]\n\nFrame 2:\n  Drone pose: [96.61, 18.14, 20.0, -47.17, -112.55, 0.0]\n  Target bbox: [615.85, 322.76, 664.03, 396.36]\n\nFrame 3:\n  Drone pose: [96.39, 17.64, 20.0, -47.3, -111.86, 0.0]\n  Target bbox: [623.18, 324.73, 657.01, 394.36]\n\nFrame 4:\n  Drone pose: [96.17, 17.14, 20.0, -47.13, -112.72, 0.0]\n  Target bbox: [618.11, 323.81, 661.76, 395.31]\n\nFrame 5 (current):\n  Drone pose: [95.96, 16.64, 20.0, -47.26, -112.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.93, \"ymin\": 326.4, \"xmax\": 658.17, \"ymax\": 392.68}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": -0.8, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.99, "window_alt_abs_m": 0.0, "target_px_mean_hist": 608.2, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.39, 62.92, 22.0, -46.48, -92.86, 0.0]\n  Target bbox: [620.19, 326.39, 659.89, 393.0]\n\nFrame 2:\n  Drone pose: [106.62, 61.34, 21.25, -48.03, -99.71, 0.0]\n  Target bbox: [675.86, 259.13, 717.17, 328.34]\n\nFrame 3:\n  Drone pose: [106.16, 60.6, 20.67, -46.74, -91.9, 0.0]\n  Target bbox: [650.46, 326.85, 690.8, 388.5]\n\nFrame 4:\n  Drone pose: [106.05, 59.97, 20.64, -44.38, -92.8, 0.0]\n  Target bbox: [668.08, 370.15, 700.92, 430.86]\n\nFrame 5 (current):\n  Drone pose: [106.03, 59.43, 20.62, -46.79, -88.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.55, \"ymin\": 328.86, \"xmax\": 656.74, \"ymax\": 390.22}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.55, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.05, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.19, \"dyaw\": 0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.46, "window_alt_abs_m": 1.38, "target_px_mean_hist": 527.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.93, 56.86, 20.42, -45.0, -88.72, 0.0]\n  Target bbox: [622.95, 355.68, 660.86, 417.15]\n\nFrame 2:\n  Drone pose: [105.87, 56.34, 20.39, -46.93, -84.52, 0.0]\n  Target bbox: [580.9, 324.06, 609.74, 385.35]\n\nFrame 3:\n  Drone pose: [105.8, 55.83, 20.36, -46.86, -92.26, 0.0]\n  Target bbox: [665.22, 318.27, 710.36, 393.26]\n\nFrame 4:\n  Drone pose: [105.73, 55.32, 20.33, -48.09, -94.5, 0.0]\n  Target bbox: [673.99, 301.95, 722.96, 369.07]\n\nFrame 5 (current):\n  Drone pose: [105.82, 54.74, 20.21, -46.61, -80.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.38, \"ymin\": 330.79, \"xmax\": 603.77, \"ymax\": 399.76}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": -0.45, \"dz\": 0.06, \"dpitch\": 0.11, \"dyaw\": -8.36, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": -0.96, \"dz\": 0.03, \"dpitch\": 0.13, \"dyaw\": -8.3, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": -1.47, \"dz\": 0.01, \"dpitch\": 0.15, \"dyaw\": -8.3, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -1.99, \"dz\": -0.02, \"dpitch\": 0.17, \"dyaw\": -8.38, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -2.51, \"dz\": -0.04, \"dpitch\": 0.17, \"dyaw\": -8.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 27.84, "window_alt_abs_m": 0.21, "target_px_mean_hist": 590.2, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.75, 52.33, 20.04, -43.14, -96.06, 0.0]\n  Target bbox: [619.04, 323.4, 660.91, 396.11]\n\nFrame 2:\n  Drone pose: [105.76, 51.71, 20.15, -42.57, -90.34, 0.0]\n  Target bbox: [627.01, 386.92, 670.72, 462.67]\n\nFrame 3:\n  Drone pose: [105.96, 51.12, 20.19, -49.35, -83.85, 0.0]\n  Target bbox: [620.31, 323.97, 659.64, 395.06]\n\nFrame 4:\n  Drone pose: [105.82, 50.54, 20.1, -53.71, -90.53, 0.0]\n  Target bbox: [670.23, 238.26, 721.14, 316.23]\n\nFrame 5 (current):\n  Drone pose: [106.11, 50.13, 20.1, -43.73, -91.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 633.02, \"ymin\": 375.88, \"xmax\": 663.04, \"ymax\": 436.4}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": -0.55, \"dz\": -0.01, \"dpitch\": -2.82, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.09, \"dz\": -0.02, \"dpitch\": -2.87, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": -1.65, \"dz\": -0.03, \"dpitch\": -2.93, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -2.22, \"dz\": -0.04, \"dpitch\": -3.01, \"dyaw\": -1.82, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -2.79, \"dz\": -0.05, \"dpitch\": -3.09, \"dyaw\": -2.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.72, "window_alt_abs_m": 0.25, "target_px_mean_hist": 561.5, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.96, 47.32, 20.16, -44.66, -97.14, 0.0]\n  Target bbox: [682.2, 284.2, 723.32, 347.67]\n\nFrame 2:\n  Drone pose: [107.36, 46.75, 20.04, -49.06, -95.7, 0.0]\n  Target bbox: [634.79, 291.89, 668.13, 355.49]\n\nFrame 3:\n  Drone pose: [107.58, 46.16, 20.04, -47.45, -96.5, 0.0]\n  Target bbox: [639.01, 320.58, 665.59, 384.2]\n\nFrame 4:\n  Drone pose: [107.77, 45.57, 20.03, -43.88, -97.79, 0.0]\n  Target bbox: [642.58, 381.57, 677.27, 447.29]\n\nFrame 5 (current):\n  Drone pose: [107.9, 44.96, 20.03, -47.27, -96.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.75, \"ymin\": 328.01, \"xmax\": 654.03, \"ymax\": 391.04}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": -0.61, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.23, \"dz\": -0.01, \"dpitch\": -0.25, \"dyaw\": -1.91, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.86, \"dz\": -0.01, \"dpitch\": -0.34, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -2.49, \"dz\": -0.02, \"dpitch\": -0.43, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": -3.12, \"dz\": -0.02, \"dpitch\": -0.52, \"dyaw\": -5.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.79, "window_alt_abs_m": 0.13, "target_px_mean_hist": 592.8, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.52, 41.88, 19.94, -47.7, -96.13, 0.0]\n  Target bbox: [560.53, 296.79, 601.72, 361.96]\n\nFrame 2:\n  Drone pose: [107.34, 41.39, 20.1, -46.77, -111.76, 0.0]\n  Target bbox: [620.71, 328.02, 659.0, 391.0]\n\nFrame 3:\n  Drone pose: [107.18, 40.6, 20.01, -47.95, -104.44, 0.0]\n  Target bbox: [618.99, 322.66, 661.15, 396.51]\n\nFrame 4:\n  Drone pose: [106.94, 39.99, 20.01, -47.41, -100.36, 0.0]\n  Target bbox: [563.29, 336.84, 603.59, 406.49]\n\nFrame 5 (current):\n  Drone pose: [106.67, 39.38, 20.0, -52.11, -105.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 608.73, \"ymin\": 253.91, \"xmax\": 653.34, \"ymax\": 329.89}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": -0.59, \"dz\": 0.0, \"dpitch\": 3.98, \"dyaw\": -1.58, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -1.18, \"dz\": 0.0, \"dpitch\": 3.95, \"dyaw\": -2.32, \"droll\": 0.0}, {\"dx\": -0.88, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": 3.92, \"dyaw\": -3.0, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -2.32, \"dz\": 0.0, \"dpitch\": 3.91, \"dyaw\": -3.64, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -2.88, \"dz\": 0.0, \"dpitch\": 3.92, \"dyaw\": -4.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 32.09, "window_alt_abs_m": 0.25, "target_px_mean_hist": 588.5, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.14, 36.5, 20.0, -48.19, -109.69, 0.0]\n  Target bbox: [619.31, 323.68, 660.84, 395.44]\n\nFrame 2:\n  Drone pose: [104.81, 35.95, 20.0, -44.33, -108.43, 0.0]\n  Target bbox: [601.08, 387.91, 636.51, 461.0]\n\nFrame 3:\n  Drone pose: [104.66, 35.48, 19.93, -45.06, -113.34, 0.0]\n  Target bbox: [618.69, 323.49, 661.61, 395.85]\n\nFrame 4:\n  Drone pose: [104.2, 34.87, 20.0, -47.36, -112.0, 0.0]\n  Target bbox: [626.98, 335.89, 662.29, 406.7]\n\nFrame 5 (current):\n  Drone pose: [103.93, 34.35, 20.0, -50.45, -107.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 565.66, \"ymin\": 285.97, \"xmax\": 600.86, \"ymax\": 352.61}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 2.33, \"dyaw\": -4.25, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": 2.5, \"dyaw\": -5.12, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": -1.52, \"dz\": 0.0, \"dpitch\": 2.37, \"dyaw\": -4.5, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -2.02, \"dz\": 0.0, \"dpitch\": 2.57, \"dyaw\": -5.45, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": 2.47, \"dyaw\": -4.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.16, "window_alt_abs_m": 0.14, "target_px_mean_hist": 618.2, "cur_frame_id": 49, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.88, 31.84, 20.0, -47.98, -112.21, 0.0]\n  Target bbox: [618.52, 324.28, 661.35, 394.79]\n\nFrame 2:\n  Drone pose: [102.69, 31.36, 20.0, -53.08, -111.47, 0.0]\n  Target bbox: [615.57, 236.71, 661.62, 314.26]\n\nFrame 3:\n  Drone pose: [102.49, 30.87, 20.0, -50.84, -112.21, 0.0]\n  Target bbox: [616.35, 274.43, 656.53, 345.24]\n\nFrame 4:\n  Drone pose: [102.28, 30.38, 20.0, -50.18, -116.84, 0.0]\n  Target bbox: [675.2, 288.18, 718.46, 361.19]\n\nFrame 5 (current):\n  Drone pose: [102.06, 29.89, 20.0, -44.45, -108.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 562.72, \"ymin\": 381.5, \"xmax\": 612.17, \"ymax\": 453.44}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -3.48, \"dyaw\": -3.88, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": -3.29, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": -0.7, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": -3.42, \"dyaw\": -3.96, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": -3.23, \"dyaw\": -4.79, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": -2.44, \"dz\": 0.0, \"dpitch\": -3.35, \"dyaw\": -4.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 622.2, "cur_frame_id": 58, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.92, 27.45, 20.0, -47.18, -107.18, 0.0]\n  Target bbox: [562.8, 335.73, 603.72, 408.1]\n\nFrame 2:\n  Drone pose: [100.71, 26.96, 20.0, -52.6, -114.26, 0.0]\n  Target bbox: [634.32, 240.05, 673.8, 311.05]\n\nFrame 3:\n  Drone pose: [100.5, 26.48, 20.0, -47.71, -112.39, 0.0]\n  Target bbox: [617.97, 323.31, 661.87, 395.79]\n\nFrame 4:\n  Drone pose: [100.28, 25.99, 20.0, -44.24, -116.71, 0.0]\n  Target bbox: [675.32, 384.37, 718.84, 458.63]\n\nFrame 5 (current):\n  Drone pose: [100.06, 25.5, 20.0, -52.61, -108.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 567.22, \"ymin\": 240.84, \"xmax\": 616.73, \"ymax\": 313.35}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 4.85, \"dyaw\": -3.45, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 5.03, \"dyaw\": -4.24, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": -1.47, \"dz\": 0.0, \"dpitch\": 4.89, \"dyaw\": -3.47, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": -1.96, \"dz\": 0.0, \"dpitch\": 5.06, \"dyaw\": -4.21, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": -2.45, \"dz\": 0.0, \"dpitch\": 4.91, \"dyaw\": -3.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 615.5, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.84, 23.05, 20.0, -49.81, -116.77, 0.0]\n  Target bbox: [675.92, 289.67, 717.33, 362.06]\n\nFrame 2:\n  Drone pose: [98.6, 22.55, 20.0, -47.5, -112.6, 0.0]\n  Target bbox: [618.37, 322.07, 661.9, 397.17]\n\nFrame 3:\n  Drone pose: [98.37, 22.06, 20.0, -52.11, -117.29, 0.0]\n  Target bbox: [666.84, 242.52, 702.73, 310.62]\n\nFrame 4:\n  Drone pose: [98.16, 21.58, 20.0, -47.23, -112.76, 0.0]\n  Target bbox: [618.48, 324.35, 661.38, 394.75]\n\nFrame 5 (current):\n  Drone pose: [97.94, 21.12, 19.93, -50.62, -118.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 668.46, \"ymin\": 307.42, \"xmax\": 718.26, \"ymax\": 382.44}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.52, \"dz\": 0.07, \"dpitch\": 3.18, \"dyaw\": 7.47, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -1.01, \"dz\": 0.07, \"dpitch\": 3.36, \"dyaw\": 6.61, \"droll\": 0.0}, {\"dx\": -0.64, \"dy\": -1.5, \"dz\": 0.07, \"dpitch\": 3.24, \"dyaw\": 7.31, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -1.99, \"dz\": 0.07, \"dpitch\": 3.42, \"dyaw\": 6.49, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": -2.49, \"dz\": 0.07, \"dpitch\": 3.28, \"dyaw\": 7.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.56, "window_alt_abs_m": 0.07, "target_px_mean_hist": 620.5, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [96.85, 18.63, 20.0, -49.63, -106.73, 0.0]\n  Target bbox: [562.61, 286.91, 602.67, 358.82]\n\nFrame 2:\n  Drone pose: [96.74, 18.15, 19.96, -51.51, -109.5, 0.0]\n  Target bbox: [563.4, 246.51, 600.19, 311.79]\n\nFrame 3:\n  Drone pose: [96.39, 17.64, 20.0, -47.3, -111.86, 0.0]\n  Target bbox: [618.59, 320.9, 661.7, 398.27]\n\nFrame 4:\n  Drone pose: [96.17, 17.14, 20.0, -49.38, -116.77, 0.0]\n  Target bbox: [663.57, 286.42, 710.13, 359.75]\n\nFrame 5 (current):\n  Drone pose: [95.96, 16.64, 20.0, -47.26, -112.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.22, \"ymin\": 321.13, \"xmax\": 661.06, \"ymax\": 398.03}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": -0.8, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.72, "window_alt_abs_m": 0.08, "target_px_mean_hist": 625.2, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776160155", "difficulty_score": 0.2157, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [123.39, -3.06, 22.0, -46.47, 180.0, 0.0]\n  Target bbox: [629.16, 327.86, 650.84, 391.42]\n\nFrame 2:\n  Drone pose: [121.88, -3.87, 21.2, -46.83, 177.55, 0.0]\n  Target bbox: [628.4, 325.37, 651.32, 393.86]\n\nFrame 3:\n  Drone pose: [120.85, -4.25, 20.67, -46.85, 176.28, 0.0]\n  Target bbox: [627.69, 323.95, 652.01, 395.24]\n\nFrame 4:\n  Drone pose: [120.07, -4.44, 20.64, -47.23, 175.64, 0.0]\n  Target bbox: [627.35, 323.11, 652.3, 396.2]\n\nFrame 5 (current):\n  Drone pose: [119.44, -4.53, 20.62, -47.38, 175.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.26, \"ymin\": 324.21, \"xmax\": 652.44, \"ymax\": 394.94}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": -0.03, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.05, \"dz\": -0.05, \"dpitch\": 0.01, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -0.06, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.07, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.08, \"dz\": -0.2, \"dpitch\": 0.21, \"dyaw\": -0.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.67, "window_alt_abs_m": 1.38, "target_px_mean_hist": 509.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [116.88, -4.61, 20.42, -47.17, 175.05, 0.0]\n  Target bbox: [627.09, 323.03, 652.57, 396.25]\n\nFrame 2:\n  Drone pose: [116.38, -4.62, 20.39, -47.13, 175.04, 0.0]\n  Target bbox: [627.67, 328.6, 652.14, 390.47]\n\nFrame 3:\n  Drone pose: [115.86, -4.61, 20.36, -47.11, 175.05, 0.0]\n  Target bbox: [627.1, 324.71, 652.6, 394.49]\n\nFrame 4:\n  Drone pose: [115.33, -4.6, 20.33, -47.12, 175.08, 0.0]\n  Target bbox: [626.28, 320.92, 653.32, 398.35]\n\nFrame 5 (current):\n  Drone pose: [114.79, -4.59, 20.3, -47.14, 175.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.0, \"ymin\": 323.28, \"xmax\": 652.67, \"ymax\": 395.91}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.02, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.04, \"dz\": -0.05, \"dpitch\": 0.22, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 0.06, \"dz\": -0.08, \"dpitch\": 0.24, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": 0.07, \"dz\": -0.11, \"dpitch\": 0.28, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": -2.55, \"dy\": 0.08, \"dz\": -0.13, \"dpitch\": 0.32, \"dyaw\": 0.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.08, "window_alt_abs_m": 0.12, "target_px_mean_hist": 540.8, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [112.24, -4.51, 20.17, -46.82, 175.35, 0.0]\n  Target bbox: [626.37, 320.64, 653.23, 398.58]\n\nFrame 2:\n  Drone pose: [111.76, -4.5, 20.15, -46.76, 175.39, 0.0]\n  Target bbox: [626.78, 321.15, 652.83, 398.15]\n\nFrame 3:\n  Drone pose: [111.28, -4.49, 20.14, -46.69, 175.42, 0.0]\n  Target bbox: [627.24, 325.91, 652.51, 393.15]\n\nFrame 4:\n  Drone pose: [110.81, -4.48, 20.12, -46.62, 175.45, 0.0]\n  Target bbox: [626.37, 320.56, 653.22, 398.73]\n\nFrame 5 (current):\n  Drone pose: [110.35, -4.48, 20.11, -46.55, 175.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.02, \"ymin\": 322.21, \"xmax\": 652.64, \"ymax\": 396.97}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": 0.01, \"dz\": -0.02, \"dpitch\": 0.08, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": 0.01, \"dz\": -0.03, \"dpitch\": 0.17, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": 0.01, \"dz\": -0.04, \"dpitch\": 0.24, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -1.86, \"dy\": 0.01, \"dz\": -0.05, \"dpitch\": 0.29, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": 0.01, \"dz\": -0.06, \"dpitch\": 0.32, \"dyaw\": 0.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.12, "window_alt_abs_m": 0.07, "target_px_mean_hist": 573.0, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [108.0, -4.47, 20.05, -46.23, 175.53, 0.0]\n  Target bbox: [627.13, 322.32, 652.53, 396.89]\n\nFrame 2:\n  Drone pose: [107.5, -4.47, 20.04, -46.21, 175.52, 0.0]\n  Target bbox: [626.99, 326.33, 652.76, 392.76]\n\nFrame 3:\n  Drone pose: [107.0, -4.47, 20.04, -46.2, 175.52, 0.0]\n  Target bbox: [627.74, 328.27, 652.06, 390.82]\n\nFrame 4:\n  Drone pose: [106.5, -4.47, 20.03, -46.19, 175.52, 0.0]\n  Target bbox: [627.51, 328.94, 652.31, 390.08]\n\nFrame 5 (current):\n  Drone pose: [106.0, -4.47, 20.03, -46.18, 175.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.38, \"ymin\": 324.59, \"xmax\": 652.33, \"ymax\": 394.6}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.49, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.02, "target_px_mean_hist": 558.2, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.51, -4.47, 20.01, -46.15, 175.52, 0.0]\n  Target bbox: [626.58, 320.94, 653.02, 398.4]\n\nFrame 2:\n  Drone pose: [103.01, -4.48, 20.01, -46.15, 175.52, 0.0]\n  Target bbox: [627.29, 323.38, 652.39, 395.82]\n\nFrame 3:\n  Drone pose: [102.51, -4.48, 20.01, -46.15, 175.51, 0.0]\n  Target bbox: [626.44, 321.84, 653.19, 397.36]\n\nFrame 4:\n  Drone pose: [102.01, -4.48, 20.01, -46.15, 175.5, 0.0]\n  Target bbox: [627.54, 327.57, 652.25, 391.48]\n\nFrame 5 (current):\n  Drone pose: [101.51, -4.49, 20.0, -46.14, 175.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.57, \"ymin\": 323.55, \"xmax\": 653.1, \"ymax\": 395.62}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.04, "window_alt_abs_m": 0.01, "target_px_mean_hist": 560.0, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [99.0, -4.59, 20.0, -46.13, 175.16, 0.0]\n  Target bbox: [626.37, 320.69, 653.22, 398.64]\n\nFrame 2:\n  Drone pose: [98.5, -4.63, 20.0, -46.13, 175.02, 0.0]\n  Target bbox: [626.31, 322.18, 653.34, 397.01]\n\nFrame 3:\n  Drone pose: [98.0, -4.69, 20.0, -46.12, 174.85, 0.0]\n  Target bbox: [626.89, 322.59, 652.76, 396.7]\n\nFrame 4:\n  Drone pose: [97.49, -4.75, 20.0, -46.12, 174.65, 0.0]\n  Target bbox: [626.99, 322.97, 652.68, 396.25]\n\nFrame 5 (current):\n  Drone pose: [96.99, -4.83, 20.0, -46.12, 174.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.79, \"ymin\": 320.45, \"xmax\": 653.8, \"ymax\": 398.83}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.28, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.98, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.38, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.76, "window_alt_abs_m": 0.0, "target_px_mean_hist": 565.8, "cur_frame_id": 49, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [94.42, -5.4, 20.0, -46.11, 172.59, 0.0]\n  Target bbox: [626.21, 325.97, 653.54, 393.14]\n\nFrame 2:\n  Drone pose: [93.9, -5.54, 20.0, -46.11, 172.14, 0.0]\n  Target bbox: [626.5, 325.65, 653.24, 393.5]\n\nFrame 3:\n  Drone pose: [93.38, -5.68, 20.0, -46.12, 171.69, 0.0]\n  Target bbox: [626.12, 323.53, 653.57, 395.66]\n\nFrame 4:\n  Drone pose: [92.85, -5.82, 20.0, -46.13, 171.26, 0.0]\n  Target bbox: [624.8, 320.03, 654.77, 399.3]\n\nFrame 5 (current):\n  Drone pose: [92.32, -5.93, 20.0, -46.15, 170.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.37, \"ymin\": 325.4, \"xmax\": 653.37, \"ymax\": 393.77}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.15, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -1.6, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -2.17, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -2.22, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -3.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 558.2, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [89.64, -6.12, 20.0, -46.08, 167.11, 0.0]\n  Target bbox: [624.19, 321.78, 655.43, 397.56]\n\nFrame 2:\n  Drone pose: [89.09, -6.1, 20.0, -46.15, 167.12, 0.0]\n  Target bbox: [619.08, 322.24, 661.03, 396.97]\n\nFrame 3:\n  Drone pose: [88.55, -6.09, 20.0, -46.04, 165.61, 0.0]\n  Target bbox: [625.2, 325.05, 654.56, 394.07]\n\nFrame 4:\n  Drone pose: [88.0, -6.07, 20.0, -46.12, 165.64, 0.0]\n  Target bbox: [623.42, 320.83, 656.16, 398.58]\n\nFrame 5 (current):\n  Drone pose: [87.46, -6.04, 20.0, -46.2, 165.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.2, \"ymin\": 322.48, \"xmax\": 660.93, \"ymax\": 396.73}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.43, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -1.3, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -2.67, \"droll\": 0.0}, {\"dx\": -2.18, \"dy\": 0.23, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -2.49, \"droll\": 0.0}, {\"dx\": -2.72, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -3.8, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.61, "window_alt_abs_m": 0.0, "target_px_mean_hist": 559.8, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [84.74, -5.73, 20.0, -46.01, 161.89, 0.0]\n  Target bbox: [623.1, 321.08, 656.54, 398.18]\n\nFrame 2:\n  Drone pose: [84.2, -5.65, 20.0, -46.11, 162.09, 0.0]\n  Target bbox: [617.89, 320.64, 662.36, 398.65]\n\nFrame 3:\n  Drone pose: [83.66, -5.57, 20.0, -45.95, 160.8, 0.0]\n  Target bbox: [623.24, 322.61, 656.45, 396.64]\n\nFrame 4:\n  Drone pose: [83.12, -5.49, 20.0, -46.05, 160.99, 0.0]\n  Target bbox: [619.58, 322.67, 660.6, 396.52]\n\nFrame 5 (current):\n  Drone pose: [82.59, -5.42, 20.0, -45.88, 159.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.64, \"ymin\": 319.94, \"xmax\": 657.92, \"ymax\": 399.49}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": -1.6, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.98, "window_alt_abs_m": 0.0, "target_px_mean_hist": 542.8, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [79.93, -4.93, 20.0, -45.82, 158.02, 0.0]\n  Target bbox: [623.12, 322.91, 656.6, 396.35]\n\nFrame 2:\n  Drone pose: [79.4, -4.8, 20.0, -45.94, 158.37, 0.0]\n  Target bbox: [621.86, 324.78, 658.28, 394.33]\n\nFrame 3:\n  Drone pose: [78.87, -4.66, 20.0, -45.77, 157.29, 0.0]\n  Target bbox: [622.48, 322.22, 657.21, 397.06]\n\nFrame 4:\n  Drone pose: [78.35, -4.52, 20.0, -45.89, 157.66, 0.0]\n  Target bbox: [618.9, 321.4, 661.41, 397.93]\n\nFrame 5 (current):\n  Drone pose: [77.82, -4.38, 20.0, -45.7, 156.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.56, \"ymin\": 319.79, \"xmax\": 659.05, \"ymax\": 399.58}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -1.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 545.5, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [123.43, -3.22, 21.99, -43.0, 178.07, 0.0]\n  Target bbox: [646.52, 388.23, 667.6, 445.45]\n\nFrame 2:\n  Drone pose: [121.8, -3.9, 21.11, -46.81, 177.45, 0.0]\n  Target bbox: [627.65, 322.92, 652.0, 396.39]\n\nFrame 3:\n  Drone pose: [120.73, -4.35, 20.71, -45.6, 178.84, 0.0]\n  Target bbox: [593.63, 347.55, 619.64, 422.45]\n\nFrame 4:\n  Drone pose: [120.07, -4.44, 20.64, -47.59, 180.64, 0.0]\n  Target bbox: [569.01, 318.07, 596.07, 392.7]\n\nFrame 5 (current):\n  Drone pose: [119.53, -4.44, 20.56, -50.03, 179.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 578.44, \"ymin\": 283.69, \"xmax\": 603.94, \"ymax\": 342.06}, \"waypoint_deltas\": [{\"dx\": -0.63, \"dy\": -0.12, \"dz\": 0.03, \"dpitch\": 2.64, \"dyaw\": -4.66, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": -0.14, \"dz\": 0.01, \"dpitch\": 2.66, \"dyaw\": -4.71, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": -0.15, \"dz\": -0.01, \"dpitch\": 2.69, \"dyaw\": -4.75, \"droll\": 0.0}, {\"dx\": -2.15, \"dy\": -0.16, \"dz\": -0.03, \"dpitch\": 2.71, \"dyaw\": -4.79, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": -0.17, \"dz\": -0.14, \"dpitch\": 2.86, \"dyaw\": -4.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.58, "window_alt_abs_m": 1.42, "target_px_mean_hist": 500.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [116.76, -4.5, 20.41, -45.77, 172.02, 0.0]\n  Target bbox: [664.48, 350.65, 692.47, 423.34]\n\nFrame 2:\n  Drone pose: [116.34, -4.48, 20.51, -45.37, 172.16, 0.0]\n  Target bbox: [663.07, 355.96, 692.46, 432.34]\n\nFrame 3:\n  Drone pose: [115.86, -4.61, 20.36, -47.11, 175.05, 0.0]\n  Target bbox: [627.62, 328.64, 652.18, 390.44]\n\nFrame 4:\n  Drone pose: [115.23, -4.62, 20.41, -44.16, 175.74, 0.0]\n  Target bbox: [618.82, 382.59, 643.56, 444.83]\n\nFrame 5 (current):\n  Drone pose: [114.95, -4.57, 20.28, -46.23, 176.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.61, \"ymin\": 333.59, \"xmax\": 637.76, \"ymax\": 407.07}, \"waypoint_deltas\": [{\"dx\": -0.69, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.92, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": 0.02, \"dz\": -0.03, \"dpitch\": -0.69, \"dyaw\": -1.26, \"droll\": 0.0}, {\"dx\": -1.72, \"dy\": 0.04, \"dz\": -0.06, \"dpitch\": -0.67, \"dyaw\": -1.21, \"droll\": 0.0}, {\"dx\": -2.22, \"dy\": 0.05, \"dz\": -0.09, \"dpitch\": -0.63, \"dyaw\": -1.16, \"droll\": 0.0}, {\"dx\": -2.71, \"dy\": 0.06, \"dz\": -0.11, \"dpitch\": -0.59, \"dyaw\": -1.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.45, "window_alt_abs_m": 0.43, "target_px_mean_hist": 536.2, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [112.17, -4.59, 20.26, -51.2, 178.69, 0.0]\n  Target bbox: [585.1, 259.79, 610.72, 321.41]\n\nFrame 2:\n  Drone pose: [111.76, -4.5, 20.15, -45.78, 180.39, 0.0]\n  Target bbox: [568.88, 342.72, 595.27, 413.04]\n\nFrame 3:\n  Drone pose: [111.09, -4.44, 20.16, -43.21, 170.54, 0.0]\n  Target bbox: [682.86, 388.94, 712.0, 462.57]\n\nFrame 4:\n  Drone pose: [110.87, -4.47, 20.21, -45.44, 180.51, 0.0]\n  Target bbox: [569.1, 349.79, 594.96, 414.13]\n\nFrame 5 (current):\n  Drone pose: [110.35, -4.48, 20.11, -48.78, 178.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.24, \"ymin\": 287.36, \"xmax\": 614.69, \"ymax\": 358.28}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": 0.01, \"dz\": -0.02, \"dpitch\": 2.31, \"dyaw\": -3.24, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": 0.01, \"dz\": -0.03, \"dpitch\": 2.4, \"dyaw\": -3.22, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": 0.01, \"dz\": -0.04, \"dpitch\": 2.47, \"dyaw\": -3.21, \"droll\": 0.0}, {\"dx\": -1.86, \"dy\": 0.01, \"dz\": -0.05, \"dpitch\": 2.52, \"dyaw\": -3.2, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": 0.01, \"dz\": -0.06, \"dpitch\": 2.55, \"dyaw\": -3.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.3, "window_alt_abs_m": 0.28, "target_px_mean_hist": 556.2, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.9, -4.35, 20.12, -49.11, 180.33, 0.0]\n  Target bbox: [574.45, 278.69, 601.84, 355.72]\n\nFrame 2:\n  Drone pose: [107.47, -4.34, 20.06, -42.95, 170.93, 0.0]\n  Target bbox: [683.29, 379.44, 712.97, 456.47]\n\nFrame 3:\n  Drone pose: [106.98, -4.61, 19.99, -43.75, 179.02, 0.0]\n  Target bbox: [580.73, 365.03, 606.87, 436.83]\n\nFrame 4:\n  Drone pose: [106.5, -4.47, 20.03, -47.91, 180.52, 0.0]\n  Target bbox: [567.89, 298.68, 595.02, 366.14]\n\nFrame 5 (current):\n  Drone pose: [105.87, -4.45, 19.88, -42.84, 180.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 568.46, \"ymin\": 386.18, \"xmax\": 594.4, \"ymax\": 448.33}, \"waypoint_deltas\": [{\"dx\": -0.37, \"dy\": -0.02, \"dz\": 0.14, \"dpitch\": -3.33, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -0.02, \"dz\": 0.14, \"dpitch\": -3.33, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": -1.37, \"dy\": -0.02, \"dz\": 0.14, \"dpitch\": -3.32, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": -1.86, \"dy\": -0.02, \"dz\": 0.13, \"dpitch\": -3.32, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": -2.36, \"dy\": -0.02, \"dz\": 0.13, \"dpitch\": -3.31, \"dyaw\": -5.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.03, "window_alt_abs_m": 0.32, "target_px_mean_hist": 576.5, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.51, -4.64, 20.08, -47.14, 180.0, 0.0]\n  Target bbox: [568.22, 309.35, 594.77, 382.95]\n\nFrame 2:\n  Drone pose: [103.01, -4.48, 20.01, -47.34, 180.43, 0.0]\n  Target bbox: [568.75, 302.73, 596.12, 380.07]\n\nFrame 3:\n  Drone pose: [102.58, -4.54, 19.93, -43.08, 171.39, 0.0]\n  Target bbox: [671.78, 370.36, 700.36, 446.04]\n\nFrame 4:\n  Drone pose: [102.14, -4.38, 19.97, -50.36, 178.27, 0.0]\n  Target bbox: [598.73, 249.16, 623.77, 320.85]\n\nFrame 5 (current):\n  Drone pose: [101.62, -4.37, 20.0, -45.97, 175.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.56, \"ymin\": 325.88, \"xmax\": 652.19, \"ymax\": 393.23}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -1.12, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -0.16, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -0.51, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -2.62, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -0.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.75, "window_alt_abs_m": 0.22, "target_px_mean_hist": 561.5, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [99.0, -4.59, 20.0, -46.13, 175.16, 0.0]\n  Target bbox: [627.02, 326.84, 652.75, 392.23]\n\nFrame 2:\n  Drone pose: [98.53, -4.75, 20.0, -48.64, 177.13, 0.0]\n  Target bbox: [597.39, 278.48, 624.26, 355.3]\n\nFrame 3:\n  Drone pose: [98.05, -4.58, 19.88, -41.95, 176.83, 0.0]\n  Target bbox: [607.58, 390.03, 633.32, 461.16]\n\nFrame 4:\n  Drone pose: [97.4, -4.6, 19.96, -49.96, 180.1, 0.0]\n  Target bbox: [567.38, 260.53, 595.27, 336.44]\n\nFrame 5 (current):\n  Drone pose: [96.95, -4.65, 19.93, -42.79, 173.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 648.37, \"ymin\": 380.04, \"xmax\": 674.94, \"ymax\": 450.41}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": -0.27, \"dz\": 0.07, \"dpitch\": -3.33, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.37, \"dz\": 0.07, \"dpitch\": -3.32, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": -0.49, \"dz\": 0.07, \"dpitch\": -3.32, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.61, \"dz\": 0.07, \"dpitch\": -3.32, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -0.75, \"dz\": 0.07, \"dpitch\": -3.32, \"dyaw\": -0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.53, "window_alt_abs_m": 0.25, "target_px_mean_hist": 563.2, "cur_frame_id": 49, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [94.47, -5.44, 20.16, -45.65, 173.08, 0.0]\n  Target bbox: [619.1, 333.86, 646.14, 406.76]\n\nFrame 2:\n  Drone pose: [94.06, -5.58, 19.9, -50.43, 170.63, 0.0]\n  Target bbox: [643.64, 246.05, 671.14, 315.03]\n\nFrame 3:\n  Drone pose: [93.38, -5.68, 20.0, -47.0, 170.18, 0.0]\n  Target bbox: [644.56, 313.68, 671.1, 376.27]\n\nFrame 4:\n  Drone pose: [92.93, -5.93, 20.11, -41.91, 175.95, 0.0]\n  Target bbox: [566.92, 396.8, 595.59, 468.99]\n\nFrame 5 (current):\n  Drone pose: [92.24, -5.87, 20.16, -42.75, 174.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 580.03, \"ymin\": 389.33, \"xmax\": 609.18, \"ymax\": 459.42}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": -0.15, \"dz\": -0.16, \"dpitch\": -3.43, \"dyaw\": -4.34, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.21, \"dz\": -0.16, \"dpitch\": -3.47, \"dyaw\": -4.54, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -0.24, \"dz\": -0.16, \"dpitch\": -3.37, \"dyaw\": -6.21, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.25, \"dz\": -0.16, \"dpitch\": -3.43, \"dyaw\": -6.26, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": -0.25, \"dz\": -0.16, \"dpitch\": -3.33, \"dyaw\": -7.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.69, "window_alt_abs_m": 0.52, "target_px_mean_hist": 550.8, "cur_frame_id": 58, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [89.64, -6.12, 20.0, -45.96, 171.86, 0.0]\n  Target bbox: [567.6, 323.83, 600.82, 403.03]\n\nFrame 2:\n  Drone pose: [89.12, -6.12, 19.87, -45.53, 172.09, 0.0]\n  Target bbox: [562.67, 332.45, 599.9, 403.45]\n\nFrame 3:\n  Drone pose: [88.55, -6.14, 20.03, -46.18, 170.46, 0.0]\n  Target bbox: [566.3, 327.25, 596.44, 391.66]\n\nFrame 4:\n  Drone pose: [88.0, -6.07, 20.0, -50.25, 162.03, 0.0]\n  Target bbox: [664.48, 251.74, 700.06, 330.65]\n\nFrame 5 (current):\n  Drone pose: [87.49, -6.13, 20.17, -50.3, 166.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 607.03, \"ymin\": 256.26, \"xmax\": 648.63, \"ymax\": 331.11}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": 0.14, \"dz\": -0.17, \"dpitch\": 4.22, \"dyaw\": -2.21, \"droll\": 0.0}, {\"dx\": -1.12, \"dy\": 0.19, \"dz\": -0.17, \"dpitch\": 4.13, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": 0.25, \"dz\": -0.17, \"dpitch\": 4.25, \"dyaw\": -3.45, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": 0.32, \"dz\": -0.17, \"dpitch\": 4.16, \"dyaw\": -3.27, \"droll\": 0.0}, {\"dx\": -2.75, \"dy\": 0.4, \"dz\": -0.17, \"dpitch\": 4.29, \"dyaw\": -4.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.74, "window_alt_abs_m": 0.48, "target_px_mean_hist": 549.8, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [84.71, -5.68, 19.94, -46.12, 164.25, 0.0]\n  Target bbox: [598.57, 324.87, 628.05, 390.08]\n\nFrame 2:\n  Drone pose: [84.2, -5.72, 19.92, -41.98, 161.76, 0.0]\n  Target bbox: [622.78, 391.12, 659.96, 461.07]\n\nFrame 3:\n  Drone pose: [83.51, -5.59, 20.01, -41.48, 163.51, 0.0]\n  Target bbox: [588.13, 401.81, 622.39, 476.94]\n\nFrame 4:\n  Drone pose: [83.01, -5.38, 19.95, -46.21, 161.22, 0.0]\n  Target bbox: [620.26, 323.33, 659.91, 395.86]\n\nFrame 5 (current):\n  Drone pose: [82.59, -5.42, 20.0, -45.88, 159.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.65, \"ymin\": 325.76, \"xmax\": 655.15, \"ymax\": 393.43}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": -1.6, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.07, "window_alt_abs_m": 0.22, "target_px_mean_hist": 551.0, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [79.93, -4.93, 20.0, -48.03, 161.43, 0.0]\n  Target bbox: [582.35, 287.49, 616.86, 359.38]\n\nFrame 2:\n  Drone pose: [79.4, -4.8, 20.0, -45.94, 158.37, 0.0]\n  Target bbox: [620.64, 322.92, 659.62, 396.38]\n\nFrame 3:\n  Drone pose: [78.76, -4.53, 19.91, -45.88, 157.53, 0.0]\n  Target bbox: [624.23, 325.4, 655.58, 393.79]\n\nFrame 4:\n  Drone pose: [78.27, -4.58, 20.12, -44.86, 159.05, 0.0]\n  Target bbox: [603.71, 347.5, 637.1, 415.32]\n\nFrame 5 (current):\n  Drone pose: [77.67, -4.41, 20.07, -50.72, 156.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.59, \"ymin\": 241.28, \"xmax\": 653.89, \"ymax\": 319.16}, \"waypoint_deltas\": [{\"dx\": -0.37, \"dy\": 0.16, \"dz\": -0.07, \"dpitch\": 4.9, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.9, \"dy\": 0.29, \"dz\": -0.07, \"dpitch\": 5.09, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.43, \"dz\": -0.07, \"dpitch\": 4.98, \"dyaw\": -0.44, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.56, \"dz\": -0.07, \"dpitch\": 4.87, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": 0.7, \"dz\": -0.07, \"dpitch\": 5.06, \"dyaw\": -1.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.81, "window_alt_abs_m": 0.35, "target_px_mean_hist": 554.2, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_432", "difficulty_score": 0.2287, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [9.17, 16.91, 22.0, -46.97, -171.47, 0.0]\n  Target bbox: [629.43, 340.37, 650.55, 379.4]\n\nFrame 2:\n  Drone pose: [7.51, 15.53, 21.2, -47.75, -173.61, 0.0]\n  Target bbox: [632.11, 339.01, 647.99, 380.69]\n\nFrame 3:\n  Drone pose: [6.5, 14.71, 20.67, -47.91, -176.01, 0.0]\n  Target bbox: [629.6, 339.79, 650.38, 379.93]\n\nFrame 4:\n  Drone pose: [5.81, 14.23, 20.64, -48.17, -175.91, 0.0]\n  Target bbox: [629.41, 339.6, 650.57, 380.11]\n\nFrame 5 (current):\n  Drone pose: [5.26, 13.93, 20.62, -48.18, -175.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.59, \"ymin\": 339.69, \"xmax\": 648.52, \"ymax\": 380.02}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.19, \"dz\": -0.03, \"dpitch\": 0.0, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.34, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": -1.08, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": -0.46, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": -1.47, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": -0.57, \"dz\": -0.09, \"dpitch\": 0.07, \"dyaw\": -1.81, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": -0.66, \"dz\": -0.2, \"dpitch\": 0.19, \"dyaw\": -2.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.3, "window_alt_abs_m": 1.38, "target_px_mean_hist": 192.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [3.27, 13.36, 20.53, -48.11, -177.07, 0.0]\n  Target bbox: [632.21, 338.41, 647.91, 381.27]\n\nFrame 2:\n  Drone pose: [2.75, 13.27, 20.42, -47.99, -177.35, 0.0]\n  Target bbox: [632.39, 338.65, 647.71, 381.01]\n\nFrame 3:\n  Drone pose: [2.21, 13.2, 20.39, -48.01, -177.57, 0.0]\n  Target bbox: [632.43, 339.36, 647.67, 380.31]\n\nFrame 4:\n  Drone pose: [1.66, 13.15, 20.36, -48.05, -177.73, 0.0]\n  Target bbox: [631.58, 342.06, 648.5, 377.6]\n\nFrame 5 (current):\n  Drone pose: [1.11, 13.11, 20.33, -48.08, -177.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.96, \"ymin\": 339.39, \"xmax\": 648.16, \"ymax\": 380.31}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": -0.02, \"dz\": -0.03, \"dpitch\": -0.03, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.03, \"dz\": -0.06, \"dpitch\": -0.03, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": -0.03, \"dz\": -0.08, \"dpitch\": -0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": 0.0, \"dz\": -0.11, \"dpitch\": 0.03, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": 0.03, \"dz\": -0.14, \"dpitch\": 0.08, \"dyaw\": 0.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.77, "window_alt_abs_m": 0.2, "target_px_mean_hist": 210.8, "cur_frame_id": 12, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.46, 13.14, 20.19, -48.0, -177.73, 0.0]\n  Target bbox: [632.05, 338.54, 648.07, 381.16]\n\nFrame 2:\n  Drone pose: [-1.94, 13.2, 20.17, -47.93, -177.55, 0.0]\n  Target bbox: [632.02, 338.5, 648.1, 381.19]\n\nFrame 3:\n  Drone pose: [-2.41, 13.27, 20.15, -47.85, -177.33, 0.0]\n  Target bbox: [631.69, 340.51, 648.41, 379.17]\n\nFrame 4:\n  Drone pose: [-2.87, 13.36, 20.14, -47.76, -177.05, 0.0]\n  Target bbox: [631.42, 338.29, 648.71, 381.41]\n\nFrame 5 (current):\n  Drone pose: [-3.31, 13.46, 20.12, -47.63, -176.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.83, \"ymin\": 342.19, \"xmax\": 648.24, \"ymax\": 377.47}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.12, \"dz\": -0.01, \"dpitch\": 0.14, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": 0.24, \"dz\": -0.03, \"dpitch\": 0.28, \"dyaw\": 0.74, \"droll\": 0.0}, {\"dx\": -1.33, \"dy\": 0.37, \"dz\": -0.04, \"dpitch\": 0.36, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": -1.83, \"dy\": 0.52, \"dz\": -0.05, \"dpitch\": 0.4, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": -2.36, \"dy\": 0.66, \"dz\": -0.06, \"dpitch\": 0.39, \"dyaw\": 2.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.0, "window_alt_abs_m": 0.07, "target_px_mean_hist": 212.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00029/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.14, 13.98, 20.07, -47.23, -175.13, 0.0]\n  Target bbox: [631.73, 340.82, 648.36, 378.86]\n\nFrame 2:\n  Drone pose: [-5.67, 14.12, 20.06, -47.24, -174.65, 0.0]\n  Target bbox: [631.62, 340.56, 648.46, 379.11]\n\nFrame 3:\n  Drone pose: [-6.23, 14.27, 20.05, -47.3, -174.17, 0.0]\n  Target bbox: [631.59, 337.85, 648.54, 381.84]\n\nFrame 4:\n  Drone pose: [-6.81, 14.39, 20.04, -47.4, -173.77, 0.0]\n  Target bbox: [630.84, 337.86, 649.29, 381.85]\n\nFrame 5 (current):\n  Drone pose: [-7.43, 14.45, 20.04, -47.56, -173.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.84, \"ymin\": 338.34, \"xmax\": 649.29, \"ymax\": 381.36}, \"waypoint_deltas\": [{\"dx\": -0.64, \"dy\": 0.01, \"dz\": -0.01, \"dpitch\": -0.22, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.04, \"dz\": -0.01, \"dpitch\": -0.38, \"dyaw\": 1.58, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -0.15, \"dz\": -0.02, \"dpitch\": -0.57, \"dyaw\": 2.93, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": -0.31, \"dz\": -0.02, \"dpitch\": -0.77, \"dyaw\": 4.15, \"droll\": 0.0}, {\"dx\": -3.36, \"dy\": -0.5, \"dz\": -0.02, \"dpitch\": -0.97, \"dyaw\": 5.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.62, "window_alt_abs_m": 0.03, "target_px_mean_hist": 219.8, "cur_frame_id": 29, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.79, 13.95, 20.02, -48.53, -168.21, 0.0]\n  Target bbox: [627.15, 336.81, 652.8, 382.85]\n\nFrame 2:\n  Drone pose: [-11.47, 13.76, 20.01, -48.71, -167.1, 0.0]\n  Target bbox: [626.8, 336.75, 653.14, 382.94]\n\nFrame 3:\n  Drone pose: [-12.14, 13.55, 20.01, -48.86, -166.01, 0.0]\n  Target bbox: [628.21, 338.23, 651.75, 381.43]\n\nFrame 4:\n  Drone pose: [-12.8, 13.33, 20.01, -49.0, -164.94, 0.0]\n  Target bbox: [626.94, 336.79, 653.0, 382.87]\n\nFrame 5 (current):\n  Drone pose: [-13.45, 13.11, 20.01, -49.12, -163.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.76, \"ymin\": 339.03, \"xmax\": 651.2, \"ymax\": 380.65}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 1.04, \"droll\": 0.0}, {\"dx\": -1.29, \"dy\": -0.46, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": 2.06, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": -0.7, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": 3.05, \"droll\": 0.0}, {\"dx\": -2.55, \"dy\": -0.94, \"dz\": -0.01, \"dpitch\": -0.11, \"dyaw\": 4.02, \"droll\": 0.0}, {\"dx\": -3.18, \"dy\": -1.2, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": 4.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.33, "window_alt_abs_m": 0.01, "target_px_mean_hist": 211.0, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-16.0, 12.17, 20.0, -49.23, -159.86, 0.0]\n  Target bbox: [627.32, 336.85, 652.61, 382.77]\n\nFrame 2:\n  Drone pose: [-16.63, 11.91, 20.0, -49.28, -158.92, 0.0]\n  Target bbox: [628.53, 337.85, 651.39, 381.81]\n\nFrame 3:\n  Drone pose: [-17.25, 11.65, 20.0, -49.32, -158.01, 0.0]\n  Target bbox: [627.19, 337.04, 652.73, 382.6]\n\nFrame 4:\n  Drone pose: [-17.85, 11.37, 20.0, -49.34, -157.17, 0.0]\n  Target bbox: [627.71, 337.2, 652.21, 382.42]\n\nFrame 5 (current):\n  Drone pose: [-18.44, 11.06, 20.0, -49.35, -156.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.14, \"ymin\": 337.86, \"xmax\": 651.79, \"ymax\": 381.77}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": -1.3, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 2.7, \"droll\": 0.0}, {\"dx\": -2.94, \"dy\": -1.61, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 3.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.45, "window_alt_abs_m": 0.0, "target_px_mean_hist": 221.5, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-21.38, 9.45, 20.0, -49.38, -152.98, 0.0]\n  Target bbox: [627.72, 337.29, 652.18, 382.33]\n\nFrame 2:\n  Drone pose: [-21.98, 9.16, 20.0, -49.36, -152.2, 0.0]\n  Target bbox: [628.99, 338.46, 650.93, 381.18]\n\nFrame 3:\n  Drone pose: [-22.57, 8.88, 20.0, -49.33, -151.38, 0.0]\n  Target bbox: [628.47, 337.06, 651.62, 382.59]\n\nFrame 4:\n  Drone pose: [-23.16, 8.6, 20.0, -49.9, -152.08, 0.0]\n  Target bbox: [629.58, 339.91, 650.35, 379.72]\n\nFrame 5 (current):\n  Drone pose: [-23.76, 8.32, 20.0, -49.88, -151.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.14, \"ymin\": 339.23, \"xmax\": 650.78, \"ymax\": 380.38}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.59, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 1.56, \"droll\": 0.0}, {\"dx\": -1.75, \"dy\": -0.9, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 2.26, \"droll\": 0.0}, {\"dx\": -2.32, \"dy\": -1.22, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": 2.9, \"droll\": 0.0}, {\"dx\": -2.88, \"dy\": -1.56, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 3.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.14, "window_alt_abs_m": 0.0, "target_px_mean_hist": 223.0, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00063/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.08, 7.1, 20.0, -49.68, -148.34, 0.0]\n  Target bbox: [629.86, 339.78, 650.07, 379.83]\n\nFrame 2:\n  Drone pose: [-26.64, 6.76, 20.0, -49.63, -147.76, 0.0]\n  Target bbox: [628.31, 338.42, 651.58, 381.23]\n\nFrame 3:\n  Drone pose: [-27.19, 6.39, 20.0, -49.58, -147.27, 0.0]\n  Target bbox: [629.81, 340.15, 650.11, 379.46]\n\nFrame 4:\n  Drone pose: [-27.74, 6.0, 20.0, -49.55, -146.86, 0.0]\n  Target bbox: [629.95, 339.76, 649.96, 379.89]\n\nFrame 5 (current):\n  Drone pose: [-28.28, 5.59, 20.0, -49.52, -146.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.44, \"ymin\": 338.35, \"xmax\": 651.44, \"ymax\": 381.3}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.42, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.86, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": -1.31, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -1.78, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.82, "window_alt_abs_m": 0.0, "target_px_mean_hist": 223.5, "cur_frame_id": 63, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.86, 3.34, 20.0, -49.39, -145.65, 0.0]\n  Target bbox: [629.48, 338.54, 650.41, 381.11]\n\nFrame 2:\n  Drone pose: [-31.35, 2.86, 20.0, -49.37, -145.63, 0.0]\n  Target bbox: [629.29, 338.98, 650.61, 380.63]\n\nFrame 3:\n  Drone pose: [-31.83, 2.36, 20.0, -49.33, -145.63, 0.0]\n  Target bbox: [629.16, 338.63, 650.74, 380.99]\n\nFrame 4:\n  Drone pose: [-32.31, 1.87, 20.0, -49.3, -145.66, 0.0]\n  Target bbox: [631.24, 340.15, 648.68, 379.49]\n\nFrame 5 (current):\n  Drone pose: [-32.78, 1.39, 20.0, -49.24, -145.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.72, \"ymin\": 340.28, \"xmax\": 650.2, \"ymax\": 379.36}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -0.93, \"dy\": -0.92, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -1.34, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": -1.82, \"dy\": -1.73, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": -2.1, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": -0.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.07, "window_alt_abs_m": 0.0, "target_px_mean_hist": 226.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/ORI/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.6, -0.34, 20.0, -49.2, -146.67, 0.0]\n  Target bbox: [630.56, 339.63, 649.35, 380.01]\n\nFrame 2:\n  Drone pose: [-35.04, -0.71, 20.0, -48.99, -146.41, 0.0]\n  Target bbox: [628.51, 337.81, 651.55, 381.85]\n\nFrame 3:\n  Drone pose: [-35.47, -1.06, 20.0, -49.22, -147.52, 0.0]\n  Target bbox: [628.49, 338.0, 651.4, 381.61]\n\nFrame 4:\n  Drone pose: [-35.91, -1.39, 20.0, -48.98, -147.15, 0.0]\n  Target bbox: [627.67, 337.48, 652.4, 382.23]\n\nFrame 5 (current):\n  Drone pose: [-36.34, -1.7, 20.0, -49.16, -148.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.97, \"ymin\": 340.01, \"xmax\": 648.95, \"ymax\": 379.63}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.9, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -1.54, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -1.21, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -1.07, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -2.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.77, "window_alt_abs_m": 0.0, "target_px_mean_hist": 219.2, "cur_frame_id": 80, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [9.12, 16.72, 21.97, -47.24, -166.98, 0.0]\n  Target bbox: [571.9, 339.6, 593.56, 377.37]\n\nFrame 2:\n  Drone pose: [7.51, 15.53, 21.2, -48.33, -169.54, 0.0]\n  Target bbox: [585.16, 333.51, 602.73, 369.28]\n\nFrame 3:\n  Drone pose: [6.5, 14.71, 20.67, -47.91, -176.01, 0.0]\n  Target bbox: [627.2, 337.88, 652.79, 381.87]\n\nFrame 4:\n  Drone pose: [5.81, 14.23, 20.64, -48.17, -175.91, 0.0]\n  Target bbox: [627.22, 337.38, 652.77, 382.32]\n\nFrame 5 (current):\n  Drone pose: [5.26, 13.93, 20.62, -43.55, -173.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.02, \"ymin\": 416.6, \"xmax\": 633.17, \"ymax\": 459.09}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.19, \"dz\": -0.03, \"dpitch\": -4.63, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.34, \"dz\": -0.05, \"dpitch\": -4.6, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": -0.46, \"dz\": -0.07, \"dpitch\": -4.58, \"dyaw\": -2.84, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": -0.57, \"dz\": -0.09, \"dpitch\": -4.56, \"dyaw\": -3.18, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": -0.66, \"dz\": -0.2, \"dpitch\": -4.44, \"dyaw\": -3.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.16, "window_alt_abs_m": 1.35, "target_px_mean_hist": 190.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [3.36, 13.25, 20.52, -49.56, -172.42, 0.0]\n  Target bbox: [574.81, 312.99, 592.75, 356.69]\n\nFrame 2:\n  Drone pose: [2.79, 13.24, 20.24, -52.68, -180.04, 0.0]\n  Target bbox: [661.07, 255.16, 677.82, 297.47]\n\nFrame 3:\n  Drone pose: [2.02, 13.2, 20.4, -48.33, -177.56, 0.0]\n  Target bbox: [631.56, 342.28, 648.51, 377.36]\n\nFrame 4:\n  Drone pose: [1.66, 13.15, 20.36, -48.05, -177.73, 0.0]\n  Target bbox: [632.0, 340.36, 648.1, 379.33]\n\nFrame 5 (current):\n  Drone pose: [1.14, 13.0, 20.32, -49.49, -179.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 647.16, \"ymin\": 315.21, \"xmax\": 663.42, \"ymax\": 355.86}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.09, \"dz\": -0.02, \"dpitch\": 1.38, \"dyaw\": 1.63, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.08, \"dz\": -0.05, \"dpitch\": 1.38, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": -1.61, \"dy\": 0.08, \"dz\": -0.07, \"dpitch\": 1.4, \"dyaw\": 1.62, \"droll\": 0.0}, {\"dx\": -2.11, \"dy\": 0.11, \"dz\": -0.1, \"dpitch\": 1.44, \"dyaw\": 1.69, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": 0.14, \"dz\": -0.13, \"dpitch\": 1.49, \"dyaw\": 1.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.08, "window_alt_abs_m": 0.51, "target_px_mean_hist": 209.2, "cur_frame_id": 12, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.46, 13.14, 20.19, -48.0, -177.73, 0.0]\n  Target bbox: [631.78, 339.85, 648.33, 379.82]\n\nFrame 2:\n  Drone pose: [-1.89, 13.18, 20.08, -49.96, -175.2, 0.0]\n  Target bbox: [604.2, 304.86, 621.39, 341.03]\n\nFrame 3:\n  Drone pose: [-2.41, 13.27, 20.15, -46.78, -176.91, 0.0]\n  Target bbox: [627.28, 356.0, 643.26, 399.52]\n\nFrame 4:\n  Drone pose: [-2.91, 13.35, 20.23, -43.8, -177.83, 0.0]\n  Target bbox: [640.17, 408.27, 657.21, 451.62]\n\nFrame 5 (current):\n  Drone pose: [-3.19, 13.47, 19.99, -45.43, -172.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 582.48, \"ymin\": 369.72, \"xmax\": 601.49, \"ymax\": 413.6}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.11, \"dz\": 0.12, \"dpitch\": -2.06, \"dyaw\": -3.85, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.23, \"dz\": 0.1, \"dpitch\": -1.92, \"dyaw\": -3.47, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": 0.36, \"dz\": 0.09, \"dpitch\": -1.84, \"dyaw\": -3.06, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": 0.51, \"dz\": 0.08, \"dpitch\": -1.8, \"dyaw\": -2.61, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": 0.65, \"dz\": 0.07, \"dpitch\": -1.81, \"dyaw\": -2.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.46, "window_alt_abs_m": 0.5, "target_px_mean_hist": 212.2, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00029/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.14, 13.98, 20.07, -45.51, -180.13, 0.0]\n  Target bbox: [688.19, 369.55, 706.1, 411.6]\n\nFrame 2:\n  Drone pose: [-5.67, 14.12, 20.06, -50.64, -179.65, 0.0]\n  Target bbox: [688.16, 284.29, 706.23, 324.78]\n\nFrame 3:\n  Drone pose: [-6.27, 14.36, 20.15, -47.5, -173.88, 0.0]\n  Target bbox: [631.33, 342.01, 648.73, 377.64]\n\nFrame 4:\n  Drone pose: [-6.81, 14.39, 20.04, -47.4, -173.77, 0.0]\n  Target bbox: [631.1, 338.7, 649.02, 381.02]\n\nFrame 5 (current):\n  Drone pose: [-7.59, 14.42, 20.01, -47.8, -173.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.45, \"ymin\": 340.84, \"xmax\": 648.63, \"ymax\": 378.84}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": 0.04, \"dz\": 0.02, \"dpitch\": 0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": -0.01, \"dz\": 0.02, \"dpitch\": -0.14, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": -1.82, \"dy\": -0.12, \"dz\": 0.01, \"dpitch\": -0.33, \"dyaw\": 3.0, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": -0.28, \"dz\": 0.01, \"dpitch\": -0.53, \"dyaw\": 4.22, \"droll\": 0.0}, {\"dx\": -3.2, \"dy\": -0.47, \"dz\": 0.01, \"dpitch\": -0.73, \"dyaw\": 5.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.55, "window_alt_abs_m": 0.25, "target_px_mean_hist": 215.0, "cur_frame_id": 29, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.79, 13.95, 20.02, -53.53, -171.66, 0.0]\n  Target bbox: [665.26, 253.48, 691.89, 299.84]\n\nFrame 2:\n  Drone pose: [-11.47, 13.76, 20.01, -47.1, -162.1, 0.0]\n  Target bbox: [573.43, 368.11, 595.53, 409.22]\n\nFrame 3:\n  Drone pose: [-12.05, 13.54, 20.17, -51.23, -171.1, 0.0]\n  Target bbox: [683.08, 302.63, 707.41, 344.61]\n\nFrame 4:\n  Drone pose: [-12.91, 13.18, 19.99, -50.43, -160.33, 0.0]\n  Target bbox: [572.14, 319.46, 597.89, 363.51]\n\nFrame 5 (current):\n  Drone pose: [-13.36, 12.95, 19.98, -49.0, -164.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.83, \"ymin\": 338.76, \"xmax\": 651.14, \"ymax\": 380.89}, \"waypoint_deltas\": [{\"dx\": -0.74, \"dy\": -0.06, \"dz\": 0.03, \"dpitch\": -0.01, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -0.3, \"dz\": 0.02, \"dpitch\": -0.09, \"dyaw\": 2.68, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.54, \"dz\": 0.02, \"dpitch\": -0.17, \"dyaw\": 3.67, \"droll\": 0.0}, {\"dx\": -2.64, \"dy\": -0.78, \"dz\": 0.02, \"dpitch\": -0.23, \"dyaw\": 4.64, \"droll\": 0.0}, {\"dx\": -3.27, \"dy\": -1.04, \"dz\": 0.02, \"dpitch\": -0.28, \"dyaw\": 5.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 33.49, "window_alt_abs_m": 0.34, "target_px_mean_hist": 219.0, "cur_frame_id": 38, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-16.0, 12.17, 20.0, -53.69, -162.07, 0.0]\n  Target bbox: [653.83, 264.23, 674.88, 306.07]\n\nFrame 2:\n  Drone pose: [-16.62, 12.04, 20.1, -49.34, -158.52, 0.0]\n  Target bbox: [627.23, 337.44, 652.69, 382.22]\n\nFrame 3:\n  Drone pose: [-17.25, 11.65, 20.0, -54.32, -163.01, 0.0]\n  Target bbox: [683.53, 256.12, 706.45, 299.11]\n\nFrame 4:\n  Drone pose: [-17.75, 11.3, 19.91, -47.78, -153.0, 0.0]\n  Target bbox: [577.72, 361.48, 602.96, 405.27]\n\nFrame 5 (current):\n  Drone pose: [-18.44, 11.06, 20.0, -49.35, -156.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.13, \"ymin\": 339.04, \"xmax\": 650.81, \"ymax\": 380.59}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": -1.3, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 2.7, \"droll\": 0.0}, {\"dx\": -2.94, \"dy\": -1.61, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 3.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.45, "window_alt_abs_m": 0.38, "target_px_mean_hist": 223.5, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-21.29, 9.48, 20.14, -48.3, -148.04, 0.0]\n  Target bbox: [573.37, 359.44, 597.13, 401.77]\n\nFrame 2:\n  Drone pose: [-21.97, 9.17, 19.9, -49.85, -147.26, 0.0]\n  Target bbox: [573.22, 328.32, 598.76, 372.91]\n\nFrame 3:\n  Drone pose: [-22.61, 8.84, 20.08, -50.35, -153.24, 0.0]\n  Target bbox: [647.05, 322.67, 672.85, 370.51]\n\nFrame 4:\n  Drone pose: [-23.12, 8.5, 19.83, -49.64, -156.11, 0.0]\n  Target bbox: [666.91, 338.33, 693.05, 384.47]\n\nFrame 5 (current):\n  Drone pose: [-23.77, 8.43, 20.01, -49.82, -150.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.65, \"ymin\": 339.47, \"xmax\": 649.27, \"ymax\": 380.16}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.4, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": -0.7, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": 1.2, \"droll\": 0.0}, {\"dx\": -1.74, \"dy\": -1.01, \"dz\": -0.01, \"dpitch\": 0.09, \"dyaw\": 1.9, \"droll\": 0.0}, {\"dx\": -2.31, \"dy\": -1.33, \"dz\": -0.01, \"dpitch\": 0.14, \"dyaw\": 2.54, \"droll\": 0.0}, {\"dx\": -2.87, \"dy\": -1.67, \"dz\": -0.01, \"dpitch\": 0.19, \"dyaw\": 3.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.87, "window_alt_abs_m": 0.87, "target_px_mean_hist": 220.2, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00063/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.08, 7.1, 20.0, -49.68, -148.34, 0.0]\n  Target bbox: [628.22, 338.27, 651.67, 381.37]\n\nFrame 2:\n  Drone pose: [-26.64, 6.76, 20.0, -46.62, -149.86, 0.0]\n  Target bbox: [652.27, 390.31, 673.53, 430.73]\n\nFrame 3:\n  Drone pose: [-27.13, 6.54, 19.95, -49.28, -146.93, 0.0]\n  Target bbox: [629.89, 339.67, 650.02, 379.98]\n\nFrame 4:\n  Drone pose: [-27.69, 5.91, 20.05, -54.44, -142.66, 0.0]\n  Target bbox: [578.87, 259.76, 601.27, 301.12]\n\nFrame 5 (current):\n  Drone pose: [-28.34, 5.58, 19.91, -51.61, -151.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 683.6, \"ymin\": 305.02, \"xmax\": 705.61, \"ymax\": 347.07}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": -0.41, \"dz\": 0.09, \"dpitch\": 2.11, \"dyaw\": 5.2, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.85, \"dz\": 0.09, \"dpitch\": 2.14, \"dyaw\": 5.42, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -1.3, \"dz\": 0.09, \"dpitch\": 2.16, \"dyaw\": 5.59, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -1.77, \"dz\": 0.09, \"dpitch\": 2.19, \"dyaw\": 5.71, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": -2.24, \"dz\": 0.09, \"dpitch\": 2.22, \"dyaw\": 5.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.5, "window_alt_abs_m": 0.29, "target_px_mean_hist": 220.8, "cur_frame_id": 63, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.86, 3.34, 20.0, -51.57, -146.99, 0.0]\n  Target bbox: [644.31, 302.13, 665.11, 344.76]\n\nFrame 2:\n  Drone pose: [-31.23, 2.72, 19.99, -50.73, -141.61, 0.0]\n  Target bbox: [578.04, 316.4, 600.6, 358.86]\n\nFrame 3:\n  Drone pose: [-31.71, 2.31, 19.94, -44.84, -151.03, 0.0]\n  Target bbox: [684.19, 413.86, 706.05, 453.63]\n\nFrame 4:\n  Drone pose: [-32.31, 1.87, 20.0, -48.84, -143.19, 0.0]\n  Target bbox: [601.52, 346.3, 624.03, 389.51]\n\nFrame 5 (current):\n  Drone pose: [-32.78, 1.39, 20.0, -47.77, -142.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 598.87, \"ymin\": 364.45, \"xmax\": 620.43, \"ymax\": 405.75}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": -1.4, \"dyaw\": -2.73, \"droll\": 0.0}, {\"dx\": -0.93, \"dy\": -0.92, \"dz\": 0.0, \"dpitch\": -1.29, \"dyaw\": -2.66, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -1.34, \"dz\": 0.0, \"dpitch\": -1.14, \"dyaw\": -2.54, \"droll\": 0.0}, {\"dx\": -1.82, \"dy\": -1.73, \"dz\": 0.0, \"dpitch\": -1.43, \"dyaw\": -3.75, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": -2.1, \"dz\": 0.0, \"dpitch\": -1.22, \"dyaw\": -3.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.92, "window_alt_abs_m": 0.11, "target_px_mean_hist": 227.2, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642/aug_001/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.59, -0.25, 20.12, -49.28, -146.4, 0.0]\n  Target bbox: [629.85, 340.47, 650.08, 379.17]\n\nFrame 2:\n  Drone pose: [-35.08, -0.54, 20.08, -53.5, -144.38, 0.0]\n  Target bbox: [612.04, 262.77, 635.17, 306.5]\n\nFrame 3:\n  Drone pose: [-35.45, -1.08, 20.01, -49.23, -147.63, 0.0]\n  Target bbox: [629.47, 340.02, 650.45, 379.62]\n\nFrame 4:\n  Drone pose: [-35.91, -1.39, 20.0, -47.94, -145.1, 0.0]\n  Target bbox: [604.41, 353.92, 630.17, 401.04]\n\nFrame 5 (current):\n  Drone pose: [-36.49, -1.65, 19.94, -48.88, -150.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 660.15, \"ymin\": 345.8, \"xmax\": 682.95, \"ymax\": 386.97}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": -0.36, \"dz\": 0.06, \"dpitch\": -0.01, \"dyaw\": 2.9, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": -0.66, \"dz\": 0.06, \"dpitch\": -0.17, \"dyaw\": 1.91, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": -0.95, \"dz\": 0.06, \"dpitch\": -0.34, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": -1.26, \"dz\": 0.06, \"dpitch\": -0.07, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -1.56, \"dz\": 0.06, \"dpitch\": -0.24, \"dyaw\": 0.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.33, "window_alt_abs_m": 0.18, "target_px_mean_hist": 216.0, "cur_frame_id": 80, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776416642", "difficulty_score": 0.2743, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.17, -80.0, 22.0, -54.26, 25.42, 0.0]\n  Target bbox: [626.07, 326.65, 653.79, 391.96]\n\nFrame 2:\n  Drone pose: [-96.99, -79.93, 21.96, -53.35, 26.35, 0.0]\n  Target bbox: [622.0, 326.35, 657.87, 392.34]\n\nFrame 3:\n  Drone pose: [-96.81, -79.86, 21.98, -52.53, 27.22, 0.0]\n  Target bbox: [620.28, 323.35, 659.46, 395.5]\n\nFrame 4:\n  Drone pose: [-96.64, -79.8, 22.01, -51.74, 28.04, 0.0]\n  Target bbox: [620.97, 324.56, 658.79, 394.35]\n\nFrame 5 (current):\n  Drone pose: [-96.46, -79.73, 22.05, -51.16, 28.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.68, \"ymin\": 323.97, \"xmax\": 658.05, \"ymax\": 395.0}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.07, \"dz\": 0.05, \"dpitch\": 0.74, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 0.14, \"dz\": 0.1, \"dpitch\": 1.46, \"dyaw\": 1.42, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": 0.21, \"dz\": 0.15, \"dpitch\": 2.16, \"dyaw\": 2.07, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": 0.27, \"dz\": 0.21, \"dpitch\": 2.85, \"dyaw\": 2.69, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.34, \"dz\": 0.27, \"dpitch\": 3.51, \"dyaw\": 3.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.4, "window_alt_abs_m": 0.14, "target_px_mean_hist": 558.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.76, -79.46, 22.26, -48.31, 31.51, 0.0]\n  Target bbox: [625.88, 328.5, 653.95, 390.6]\n\nFrame 2:\n  Drone pose: [-95.58, -79.39, 22.32, -47.65, 32.09, 0.0]\n  Target bbox: [624.03, 328.34, 655.79, 390.79]\n\nFrame 3:\n  Drone pose: [-95.41, -79.32, 22.41, -47.03, 32.65, 0.0]\n  Target bbox: [623.82, 327.02, 656.23, 392.14]\n\nFrame 4:\n  Drone pose: [-95.23, -79.25, 22.49, -46.81, 31.99, 0.0]\n  Target bbox: [625.81, 329.82, 654.2, 389.36]\n\nFrame 5 (current):\n  Drone pose: [-95.06, -79.19, 22.65, -46.68, 31.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.23, \"ymin\": 325.41, \"xmax\": 657.96, \"ymax\": 393.97}, \"waypoint_deltas\": [{\"dx\": 0.18, \"dy\": 0.07, \"dz\": 0.16, \"dpitch\": 0.13, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 0.06, \"dz\": 0.34, \"dpitch\": 0.12, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": 0.05, \"dz\": 0.5, \"dpitch\": 0.11, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.09, \"dz\": 0.57, \"dpitch\": -0.12, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 1.97, \"dy\": 0.15, \"dz\": 0.62, \"dpitch\": -0.28, \"dyaw\": -1.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.44, "window_alt_abs_m": 0.39, "target_px_mean_hist": 485.2, "cur_frame_id": 12, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.09, -79.04, 23.27, -46.96, 30.25, 0.0]\n  Target bbox: [626.1, 330.84, 653.92, 388.39]\n\nFrame 2:\n  Drone pose: [-92.49, -78.97, 23.32, -47.19, 30.22, 0.0]\n  Target bbox: [623.48, 326.56, 656.69, 392.77]\n\nFrame 3:\n  Drone pose: [-91.73, -77.47, 23.36, -48.57, 26.84, 0.0]\n  Target bbox: [625.45, 328.96, 654.66, 390.25]\n\nFrame 4:\n  Drone pose: [-91.08, -74.65, 24.16, -51.4, 19.24, 0.0]\n  Target bbox: [626.56, 330.05, 653.57, 389.05]\n\nFrame 5 (current):\n  Drone pose: [-90.77, -70.82, 24.57, -52.95, 7.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.67, \"ymin\": 334.33, \"xmax\": 652.42, \"ymax\": 384.68}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": 4.07, \"dz\": 0.44, \"dpitch\": -0.18, \"dyaw\": -13.08, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 7.61, \"dz\": 0.1, \"dpitch\": 1.36, \"dyaw\": -23.85, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": 9.43, \"dz\": -0.7, \"dpitch\": 2.83, \"dyaw\": -29.31, \"droll\": 0.0}, {\"dx\": 1.13, \"dy\": 9.73, \"dz\": -1.38, \"dpitch\": 4.93, \"dyaw\": -29.26, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 9.67, \"dz\": -1.47, \"dpitch\": 5.8, \"dyaw\": -28.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.93, "window_alt_abs_m": 1.3, "target_px_mean_hist": 438.5, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00029/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-89.64, -61.09, 23.19, -48.02, -21.94, 0.0]\n  Target bbox: [625.23, 327.89, 654.61, 391.28]\n\nFrame 2:\n  Drone pose: [-89.74, -61.15, 23.1, -47.15, -21.15, 0.0]\n  Target bbox: [627.17, 331.61, 652.78, 387.53]\n\nFrame 3:\n  Drone pose: [-89.38, -61.08, 23.0, -46.8, -21.21, 0.0]\n  Target bbox: [627.49, 331.3, 652.45, 387.84]\n\nFrame 4:\n  Drone pose: [-89.05, -61.02, 22.88, -46.4, -21.18, 0.0]\n  Target bbox: [624.18, 326.74, 656.01, 392.56]\n\nFrame 5 (current):\n  Drone pose: [-88.6, -60.75, 22.69, -46.19, -20.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.52, \"ymin\": 328.52, \"xmax\": 655.6, \"ymax\": 390.77}, \"waypoint_deltas\": [{\"dx\": 0.45, \"dy\": 0.34, \"dz\": -0.21, \"dpitch\": 0.26, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": 0.9, \"dy\": 0.73, \"dz\": -0.45, \"dpitch\": 0.59, \"dyaw\": 0.8, \"droll\": 0.0}, {\"dx\": 1.35, \"dy\": 1.15, \"dz\": -0.67, \"dpitch\": 0.92, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": 1.59, \"dz\": -0.82, \"dpitch\": 1.14, \"dyaw\": 1.23, \"droll\": 0.0}, {\"dx\": 2.29, \"dy\": 2.06, \"dz\": -0.96, \"dpitch\": 1.36, \"dyaw\": 1.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.51, "window_alt_abs_m": 0.5, "target_px_mean_hist": 457.8, "cur_frame_id": 29, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.31, -58.69, 21.73, -44.83, -19.2, 0.0]\n  Target bbox: [622.8, 325.91, 657.37, 393.43]\n\nFrame 2:\n  Drone pose: [-85.8, -58.49, 21.58, -44.78, -18.45, 0.0]\n  Target bbox: [626.24, 327.31, 653.9, 391.93]\n\nFrame 3:\n  Drone pose: [-85.26, -58.49, 21.44, -44.85, -17.16, 0.0]\n  Target bbox: [621.64, 325.8, 658.49, 393.56]\n\nFrame 4:\n  Drone pose: [-84.71, -58.5, 21.3, -44.92, -15.81, 0.0]\n  Target bbox: [625.85, 327.78, 654.2, 391.34]\n\nFrame 5 (current):\n  Drone pose: [-84.16, -58.65, 21.17, -45.05, -14.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.12, \"ymin\": 324.37, \"xmax\": 659.02, \"ymax\": 394.97}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.15, \"dz\": -0.12, \"dpitch\": -0.12, \"dyaw\": 1.76, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.29, \"dz\": -0.23, \"dpitch\": -0.23, \"dyaw\": 3.56, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": -0.44, \"dz\": -0.32, \"dpitch\": -0.33, \"dyaw\": 5.38, \"droll\": 0.0}, {\"dx\": 2.24, \"dy\": -0.59, \"dz\": -0.42, \"dpitch\": -0.41, \"dyaw\": 7.24, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.73, \"dz\": -0.5, \"dpitch\": -0.47, \"dyaw\": 9.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.11, "window_alt_abs_m": 0.55, "target_px_mean_hist": 478.8, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.37, -59.38, 20.67, -45.52, -4.96, 0.0]\n  Target bbox: [619.8, 323.13, 660.22, 396.17]\n\nFrame 2:\n  Drone pose: [-80.81, -59.53, 20.59, -45.56, -3.06, 0.0]\n  Target bbox: [624.36, 325.83, 655.69, 393.3]\n\nFrame 3:\n  Drone pose: [-80.25, -59.68, 20.52, -45.58, -1.14, 0.0]\n  Target bbox: [624.39, 325.8, 655.66, 393.32]\n\nFrame 4:\n  Drone pose: [-79.69, -59.82, 20.46, -45.57, 0.8, 0.0]\n  Target bbox: [625.01, 325.55, 654.96, 393.48]\n\nFrame 5 (current):\n  Drone pose: [-79.13, -59.97, 20.4, -45.54, 2.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.79, \"ymin\": 325.88, \"xmax\": 655.21, \"ymax\": 393.2}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": -0.15, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 1.95, \"droll\": 0.0}, {\"dx\": 1.11, \"dy\": -0.29, \"dz\": -0.1, \"dpitch\": 0.13, \"dyaw\": 3.91, \"droll\": 0.0}, {\"dx\": 1.67, \"dy\": -0.44, \"dz\": -0.14, \"dpitch\": 0.23, \"dyaw\": 5.86, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -0.59, \"dz\": -0.18, \"dpitch\": 0.36, \"dyaw\": 7.8, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.73, \"dz\": -0.21, \"dpitch\": 0.52, \"dyaw\": 9.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.72, "window_alt_abs_m": 0.27, "target_px_mean_hist": 560.8, "cur_frame_id": 47, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-76.9, -60.56, 20.22, -45.18, 10.55, 0.0]\n  Target bbox: [619.49, 322.44, 660.4, 396.85]\n\nFrame 2:\n  Drone pose: [-76.34, -60.7, 20.19, -45.02, 12.48, 0.0]\n  Target bbox: [623.76, 325.81, 656.19, 393.33]\n\nFrame 3:\n  Drone pose: [-75.79, -60.85, 20.16, -44.84, 14.4, 0.0]\n  Target bbox: [619.46, 321.69, 660.35, 397.6]\n\nFrame 4:\n  Drone pose: [-75.23, -61.0, 20.14, -44.63, 16.29, 0.0]\n  Target bbox: [622.21, 324.87, 657.7, 394.32]\n\nFrame 5 (current):\n  Drone pose: [-74.67, -61.14, 20.12, -44.4, 18.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.2, \"ymin\": 326.26, \"xmax\": 655.73, \"ymax\": 392.86}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.15, \"dz\": -0.02, \"dpitch\": 0.25, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.3, \"dz\": -0.04, \"dpitch\": 0.53, \"dyaw\": 3.65, \"droll\": 0.0}, {\"dx\": 1.67, \"dy\": -0.44, \"dz\": -0.05, \"dpitch\": 0.84, \"dyaw\": 5.43, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -0.59, \"dz\": -0.06, \"dpitch\": 1.16, \"dyaw\": 7.17, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.74, \"dz\": -0.07, \"dpitch\": 1.5, \"dyaw\": 8.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.61, "window_alt_abs_m": 0.11, "target_px_mean_hist": 577.2, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-71.88, -61.88, 20.05, -42.9, 27.03, 0.0]\n  Target bbox: [622.35, 325.17, 657.44, 394.12]\n\nFrame 2:\n  Drone pose: [-71.32, -62.02, 20.04, -42.54, 28.69, 0.0]\n  Target bbox: [625.37, 327.71, 654.48, 391.54]\n\nFrame 3:\n  Drone pose: [-70.76, -62.17, 20.03, -42.16, 30.31, 0.0]\n  Target bbox: [628.2, 326.31, 651.63, 392.89]\n\nFrame 4:\n  Drone pose: [-70.2, -62.24, 20.02, -41.83, 31.73, 0.0]\n  Target bbox: [623.05, 325.64, 656.69, 393.74]\n\nFrame 5 (current):\n  Drone pose: [-69.64, -62.3, 20.02, -41.5, 33.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.56, \"ymin\": 328.82, \"xmax\": 653.3, \"ymax\": 390.42}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 1.17, \"droll\": 0.0}, {\"dx\": 1.14, \"dy\": 0.06, \"dz\": -0.01, \"dpitch\": 0.56, \"dyaw\": 2.27, \"droll\": 0.0}, {\"dx\": 1.72, \"dy\": 0.43, \"dz\": -0.01, \"dpitch\": 0.58, \"dyaw\": 2.66, \"droll\": 0.0}, {\"dx\": 2.3, \"dy\": 0.81, \"dz\": -0.01, \"dpitch\": 0.59, \"dyaw\": 3.03, \"droll\": 0.0}, {\"dx\": 2.87, \"dy\": 1.2, \"dz\": -0.01, \"dpitch\": 0.6, \"dyaw\": 3.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.04, "window_alt_abs_m": 0.03, "target_px_mean_hist": 550.8, "cur_frame_id": 64, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-67.34, -61.49, 20.01, -40.91, 36.1, 0.0]\n  Target bbox: [628.11, 329.04, 651.73, 390.21]\n\nFrame 2:\n  Drone pose: [-66.77, -61.1, 20.01, -40.9, 36.46, 0.0]\n  Target bbox: [624.36, 326.13, 655.36, 393.27]\n\nFrame 3:\n  Drone pose: [-66.2, -60.71, 20.0, -40.89, 36.8, 0.0]\n  Target bbox: [624.8, 326.92, 654.94, 392.44]\n\nFrame 4:\n  Drone pose: [-65.63, -60.32, 20.0, -40.87, 37.13, 0.0]\n  Target bbox: [628.26, 329.36, 651.59, 389.86]\n\nFrame 5 (current):\n  Drone pose: [-65.06, -59.93, 20.0, -40.86, 37.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.07, \"ymin\": 326.16, \"xmax\": 655.62, \"ymax\": 393.27}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": 0.79, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.65, \"droll\": 0.0}, {\"dx\": 1.69, \"dy\": 1.18, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": 1.56, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": 2.81, \"dy\": 1.95, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 1.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.35, "window_alt_abs_m": 0.01, "target_px_mean_hist": 541.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/ORI/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-62.25, -57.98, 20.0, -40.73, 39.1, 0.0]\n  Target bbox: [624.73, 325.93, 654.94, 393.51]\n\nFrame 2:\n  Drone pose: [-61.68, -57.6, 20.0, -40.7, 39.44, 0.0]\n  Target bbox: [626.37, 328.86, 653.44, 390.38]\n\nFrame 3:\n  Drone pose: [-61.11, -57.2, 20.0, -40.68, 39.76, 0.0]\n  Target bbox: [628.33, 330.19, 651.53, 389.02]\n\nFrame 4:\n  Drone pose: [-60.55, -56.79, 20.0, -40.67, 40.04, 0.0]\n  Target bbox: [627.46, 329.62, 652.36, 389.68]\n\nFrame 5 (current):\n  Drone pose: [-59.99, -56.36, 20.0, -40.68, 40.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.7, \"ymin\": 329.56, \"xmax\": 653.13, \"ymax\": 389.67}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.93, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 1.42, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 1.94, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 2.62, \"dy\": 2.48, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.17, "window_alt_abs_m": 0.0, "target_px_mean_hist": 540.8, "cur_frame_id": 81, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.16, -80.04, 21.95, -52.82, 29.28, 0.0]\n  Target bbox: [585.71, 349.39, 620.44, 416.88]\n\nFrame 2:\n  Drone pose: [-97.04, -79.78, 21.88, -53.07, 20.78, 0.0]\n  Target bbox: [672.81, 331.48, 708.25, 397.84]\n\nFrame 3:\n  Drone pose: [-96.81, -79.86, 21.98, -52.0, 22.22, 0.0]\n  Target bbox: [675.69, 337.8, 707.05, 402.36]\n\nFrame 4:\n  Drone pose: [-96.51, -79.7, 21.98, -51.97, 23.33, 0.0]\n  Target bbox: [669.21, 326.53, 707.05, 394.86]\n\nFrame 5 (current):\n  Drone pose: [-96.45, -79.88, 22.15, -51.18, 29.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.31, \"ymin\": 323.91, \"xmax\": 658.41, \"ymax\": 395.06}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": 0.22, \"dz\": -0.05, \"dpitch\": 0.76, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": 0.29, \"dz\": 0.0, \"dpitch\": 1.48, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 0.36, \"dz\": 0.05, \"dpitch\": 2.18, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 0.42, \"dz\": 0.11, \"dpitch\": 2.87, \"dyaw\": 2.21, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": 0.49, \"dz\": 0.17, \"dpitch\": 3.53, \"dyaw\": 2.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.01, "window_alt_abs_m": 0.36, "target_px_mean_hist": 558.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.76, -79.46, 22.26, -48.31, 31.51, 0.0]\n  Target bbox: [628.41, 327.8, 651.43, 391.21]\n\nFrame 2:\n  Drone pose: [-95.58, -79.39, 22.32, -47.61, 35.16, 0.0]\n  Target bbox: [592.22, 331.63, 617.81, 390.29]\n\nFrame 3:\n  Drone pose: [-95.41, -79.32, 22.41, -45.92, 32.69, 0.0]\n  Target bbox: [621.55, 343.81, 657.79, 412.77]\n\nFrame 4:\n  Drone pose: [-95.23, -79.25, 22.49, -43.91, 34.97, 0.0]\n  Target bbox: [589.18, 376.59, 621.62, 441.12]\n\nFrame 5 (current):\n  Drone pose: [-95.06, -79.19, 22.65, -46.68, 31.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.65, \"ymin\": 326.27, \"xmax\": 657.5, \"ymax\": 393.08}, \"waypoint_deltas\": [{\"dx\": 0.18, \"dy\": 0.07, \"dz\": 0.16, \"dpitch\": 0.13, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 0.06, \"dz\": 0.34, \"dpitch\": 0.12, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": 0.05, \"dz\": 0.5, \"dpitch\": 0.11, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.09, \"dz\": 0.57, \"dpitch\": -0.12, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 1.97, \"dy\": 0.15, \"dz\": 0.62, \"dpitch\": -0.28, \"dyaw\": -1.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.04, "window_alt_abs_m": 0.39, "target_px_mean_hist": 472.8, "cur_frame_id": 12, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.13, -79.01, 23.39, -44.3, 32.61, 0.0]\n  Target bbox: [595.62, 375.51, 626.71, 438.42]\n\nFrame 2:\n  Drone pose: [-92.49, -78.97, 23.32, -47.19, 30.22, 0.0]\n  Target bbox: [622.79, 326.48, 657.36, 392.83]\n\nFrame 3:\n  Drone pose: [-91.73, -77.47, 23.36, -48.57, 26.84, 0.0]\n  Target bbox: [626.35, 331.2, 653.68, 387.93]\n\nFrame 4:\n  Drone pose: [-91.08, -74.65, 24.16, -51.4, 19.24, 0.0]\n  Target bbox: [625.17, 327.48, 655.02, 391.62]\n\nFrame 5 (current):\n  Drone pose: [-90.82, -70.87, 24.48, -50.74, 10.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 598.34, \"ymin\": 366.19, \"xmax\": 623.55, \"ymax\": 421.61}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": 4.12, \"dz\": 0.53, \"dpitch\": -2.39, \"dyaw\": -16.05, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": 7.66, \"dz\": 0.19, \"dpitch\": -0.85, \"dyaw\": -26.82, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": 9.48, \"dz\": -0.61, \"dpitch\": 0.62, \"dyaw\": -32.28, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": 9.78, \"dz\": -1.29, \"dpitch\": 2.72, \"dyaw\": -32.23, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 9.72, \"dz\": -1.38, \"dpitch\": 3.59, \"dyaw\": -31.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.32, "window_alt_abs_m": 1.22, "target_px_mean_hist": 428.8, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00029/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-89.55, -61.01, 23.09, -51.02, -17.26, 0.0]\n  Target bbox: [568.72, 280.46, 598.06, 340.16]\n\nFrame 2:\n  Drone pose: [-89.86, -61.26, 22.99, -46.9, -20.76, 0.0]\n  Target bbox: [627.48, 331.29, 652.46, 387.84]\n\nFrame 3:\n  Drone pose: [-89.39, -61.06, 23.19, -47.03, -21.25, 0.0]\n  Target bbox: [626.58, 330.4, 653.34, 388.77]\n\nFrame 4:\n  Drone pose: [-89.05, -61.02, 22.88, -46.4, -21.18, 0.0]\n  Target bbox: [625.77, 329.57, 654.33, 389.7]\n\nFrame 5 (current):\n  Drone pose: [-88.52, -60.72, 22.53, -43.28, -24.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 669.43, \"ymin\": 375.65, \"xmax\": 703.88, \"ymax\": 439.85}, \"waypoint_deltas\": [{\"dx\": 0.37, \"dy\": 0.31, \"dz\": -0.05, \"dpitch\": -2.65, \"dyaw\": 4.57, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 0.7, \"dz\": -0.29, \"dpitch\": -2.32, \"dyaw\": 4.91, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": 1.12, \"dz\": -0.51, \"dpitch\": -1.99, \"dyaw\": 5.16, \"droll\": 0.0}, {\"dx\": 1.73, \"dy\": 1.56, \"dz\": -0.66, \"dpitch\": -1.77, \"dyaw\": 5.34, \"droll\": 0.0}, {\"dx\": 2.21, \"dy\": 2.03, \"dz\": -0.8, \"dpitch\": -1.55, \"dyaw\": 5.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.54, "window_alt_abs_m": 0.96, "target_px_mean_hist": 450.5, "cur_frame_id": 29, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.31, -58.69, 21.73, -45.49, -17.08, 0.0]\n  Target bbox: [600.76, 317.48, 628.24, 380.25]\n\nFrame 2:\n  Drone pose: [-85.77, -58.58, 21.57, -43.09, -15.89, 0.0]\n  Target bbox: [593.46, 354.27, 630.21, 424.63]\n\nFrame 3:\n  Drone pose: [-85.34, -58.62, 21.5, -43.68, -14.77, 0.0]\n  Target bbox: [602.55, 348.47, 630.0, 411.37]\n\nFrame 4:\n  Drone pose: [-84.67, -58.42, 21.2, -44.8, -16.07, 0.0]\n  Target bbox: [621.85, 324.97, 658.27, 394.31]\n\nFrame 5 (current):\n  Drone pose: [-84.16, -58.65, 21.17, -44.45, -15.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 638.75, \"ymin\": 336.83, \"xmax\": 668.81, \"ymax\": 402.58}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.15, \"dz\": -0.12, \"dpitch\": -0.72, \"dyaw\": 2.89, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.29, \"dz\": -0.23, \"dpitch\": -0.83, \"dyaw\": 4.69, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": -0.44, \"dz\": -0.32, \"dpitch\": -0.93, \"dyaw\": 6.51, \"droll\": 0.0}, {\"dx\": 2.24, \"dy\": -0.59, \"dz\": -0.42, \"dpitch\": -1.01, \"dyaw\": 8.37, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.73, \"dz\": -0.5, \"dpitch\": -1.07, \"dyaw\": 10.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.46, "window_alt_abs_m": 0.55, "target_px_mean_hist": 472.8, "cur_frame_id": 38, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.37, -59.38, 20.67, -45.52, -4.96, 0.0]\n  Target bbox: [623.35, 324.7, 656.74, 394.46]\n\nFrame 2:\n  Drone pose: [-80.99, -59.56, 20.51, -45.18, -2.93, 0.0]\n  Target bbox: [620.48, 323.56, 659.53, 395.73]\n\nFrame 3:\n  Drone pose: [-80.17, -59.72, 20.38, -46.77, -6.0, 0.0]\n  Target bbox: [682.98, 305.68, 715.46, 373.89]\n\nFrame 4:\n  Drone pose: [-79.69, -59.82, 20.46, -45.57, 0.8, 0.0]\n  Target bbox: [622.83, 324.42, 657.23, 394.71]\n\nFrame 5 (current):\n  Drone pose: [-79.09, -59.87, 20.4, -45.62, 2.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.67, \"ymin\": 323.77, \"xmax\": 657.24, \"ymax\": 395.33}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.25, \"dz\": -0.05, \"dpitch\": 0.13, \"dyaw\": 2.25, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.39, \"dz\": -0.1, \"dpitch\": 0.21, \"dyaw\": 4.21, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": -0.54, \"dz\": -0.14, \"dpitch\": 0.31, \"dyaw\": 6.16, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": -0.69, \"dz\": -0.18, \"dpitch\": 0.44, \"dyaw\": 8.1, \"droll\": 0.0}, {\"dx\": 2.75, \"dy\": -0.83, \"dz\": -0.21, \"dpitch\": 0.6, \"dyaw\": 10.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.54, "window_alt_abs_m": 0.42, "target_px_mean_hist": 562.0, "cur_frame_id": 47, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-76.74, -60.58, 20.29, -45.51, 10.7, 0.0]\n  Target bbox: [623.02, 324.79, 656.95, 394.28]\n\nFrame 2:\n  Drone pose: [-76.39, -60.67, 20.14, -48.16, 7.36, 0.0]\n  Target bbox: [679.19, 268.89, 720.77, 344.01]\n\nFrame 3:\n  Drone pose: [-75.81, -60.86, 20.25, -44.94, 14.4, 0.0]\n  Target bbox: [619.18, 322.42, 660.68, 396.88]\n\nFrame 4:\n  Drone pose: [-75.32, -60.86, 20.13, -42.69, 20.83, 0.0]\n  Target bbox: [564.14, 359.51, 595.47, 425.58]\n\nFrame 5 (current):\n  Drone pose: [-74.67, -61.14, 20.12, -44.4, 18.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.92, \"ymin\": 323.18, \"xmax\": 657.89, \"ymax\": 396.04}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.15, \"dz\": -0.02, \"dpitch\": 0.25, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.3, \"dz\": -0.04, \"dpitch\": 0.53, \"dyaw\": 3.65, \"droll\": 0.0}, {\"dx\": 1.67, \"dy\": -0.44, \"dz\": -0.05, \"dpitch\": 0.84, \"dyaw\": 5.43, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -0.59, \"dz\": -0.06, \"dpitch\": 1.16, \"dyaw\": 7.17, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.74, \"dz\": -0.07, \"dpitch\": 1.5, \"dyaw\": 8.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.48, "window_alt_abs_m": 0.41, "target_px_mean_hist": 580.8, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-71.88, -61.88, 20.05, -42.9, 27.03, 0.0]\n  Target bbox: [625.13, 327.21, 654.74, 391.94]\n\nFrame 2:\n  Drone pose: [-71.23, -62.13, 20.1, -42.67, 29.08, 0.0]\n  Target bbox: [622.15, 324.4, 657.56, 395.0]\n\nFrame 3:\n  Drone pose: [-70.77, -62.04, 20.0, -42.2, 30.0, 0.0]\n  Target bbox: [625.58, 326.47, 654.23, 392.77]\n\nFrame 4:\n  Drone pose: [-70.21, -62.35, 19.9, -41.67, 30.86, 0.0]\n  Target bbox: [639.57, 325.93, 668.41, 389.71]\n\nFrame 5 (current):\n  Drone pose: [-69.72, -62.26, 20.0, -41.41, 32.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.21, \"ymin\": 328.73, \"xmax\": 653.65, \"ymax\": 390.49}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": -0.02, \"dz\": 0.02, \"dpitch\": 0.2, \"dyaw\": 1.36, \"droll\": 0.0}, {\"dx\": 1.22, \"dy\": 0.02, \"dz\": 0.01, \"dpitch\": 0.47, \"dyaw\": 2.46, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": 0.39, \"dz\": 0.01, \"dpitch\": 0.49, \"dyaw\": 2.85, \"droll\": 0.0}, {\"dx\": 2.38, \"dy\": 0.77, \"dz\": 0.01, \"dpitch\": 0.5, \"dyaw\": 3.22, \"droll\": 0.0}, {\"dx\": 2.95, \"dy\": 1.16, \"dz\": 0.01, \"dpitch\": 0.51, \"dyaw\": 3.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.85, "window_alt_abs_m": 0.36, "target_px_mean_hist": 555.5, "cur_frame_id": 64, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-67.28, -61.4, 20.05, -37.11, 32.06, 0.0]\n  Target bbox: [675.99, 396.5, 704.74, 459.27]\n\nFrame 2:\n  Drone pose: [-66.77, -61.1, 20.01, -39.08, 41.46, 0.0]\n  Target bbox: [562.49, 359.46, 589.84, 424.59]\n\nFrame 3:\n  Drone pose: [-66.3, -60.6, 20.04, -40.92, 36.41, 0.0]\n  Target bbox: [623.46, 326.53, 656.25, 392.87]\n\nFrame 4:\n  Drone pose: [-65.63, -60.32, 20.0, -36.29, 37.83, 0.0]\n  Target bbox: [617.68, 406.12, 643.88, 467.4]\n\nFrame 5 (current):\n  Drone pose: [-65.17, -59.98, 20.07, -40.82, 37.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.68, \"ymin\": 328.18, \"xmax\": 653.1, \"ymax\": 391.18}, \"waypoint_deltas\": [{\"dx\": 0.67, \"dy\": 0.45, \"dz\": -0.07, \"dpitch\": -0.02, \"dyaw\": 0.38, \"droll\": 0.0}, {\"dx\": 1.23, \"dy\": 0.84, \"dz\": -0.07, \"dpitch\": 0.01, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": 1.23, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": 2.36, \"dy\": 1.61, \"dz\": -0.07, \"dpitch\": 0.06, \"dyaw\": 1.37, \"droll\": 0.0}, {\"dx\": 2.92, \"dy\": 2.0, \"dz\": -0.07, \"dpitch\": 0.09, \"dyaw\": 1.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.31, "window_alt_abs_m": 0.18, "target_px_mean_hist": 546.0, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669/aug_001/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-62.25, -57.98, 20.0, -38.6, 37.19, 0.0]\n  Target bbox: [649.53, 361.86, 679.38, 429.55]\n\nFrame 2:\n  Drone pose: [-61.68, -57.49, 19.94, -35.69, 41.64, 0.0]\n  Target bbox: [594.12, 412.7, 622.95, 475.57]\n\nFrame 3:\n  Drone pose: [-61.11, -57.2, 20.0, -40.68, 39.76, 0.0]\n  Target bbox: [625.73, 327.47, 654.01, 391.86]\n\nFrame 4:\n  Drone pose: [-60.45, -56.93, 20.01, -36.92, 39.02, 0.0]\n  Target bbox: [644.2, 389.1, 673.57, 456.63]\n\nFrame 5 (current):\n  Drone pose: [-59.99, -56.36, 20.0, -40.68, 40.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.85, \"ymin\": 328.78, \"xmax\": 651.95, \"ymax\": 390.54}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.93, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 1.42, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 1.94, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 2.62, \"dy\": 2.48, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.33, "window_alt_abs_m": 0.15, "target_px_mean_hist": 549.2, "cur_frame_id": 81, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776081669", "difficulty_score": 0.4305, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-124.11, -50.06, 22.0, -54.53, 0.0, 0.0]\n  Target bbox: [625.22, 322.46, 654.78, 396.42]\n\nFrame 2:\n  Drone pose: [-124.77, -49.34, 22.54, -53.16, -2.55, 0.0]\n  Target bbox: [624.93, 328.44, 654.81, 390.5]\n\nFrame 3:\n  Drone pose: [-124.55, -48.57, 22.56, -52.63, -5.18, 0.0]\n  Target bbox: [624.45, 324.55, 655.21, 394.47]\n\nFrame 4:\n  Drone pose: [-124.42, -47.91, 22.59, -51.94, -7.26, 0.0]\n  Target bbox: [623.46, 323.09, 656.14, 396.09]\n\nFrame 5 (current):\n  Drone pose: [-124.16, -47.29, 22.61, -51.44, -9.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.56, \"ymin\": 329.82, \"xmax\": 656.24, \"ymax\": 389.24}, \"waypoint_deltas\": [{\"dx\": 0.62, \"dy\": 0.26, \"dz\": 0.03, \"dpitch\": -0.16, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": 0.41, \"dz\": 0.06, \"dpitch\": -0.47, \"dyaw\": -1.52, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": 0.49, \"dz\": 0.09, \"dpitch\": -0.81, \"dyaw\": -1.95, \"droll\": 0.0}, {\"dx\": 2.66, \"dy\": 0.54, \"dz\": 0.12, \"dpitch\": -1.05, \"dyaw\": -2.19, \"droll\": 0.0}, {\"dx\": 3.25, \"dy\": 0.57, \"dz\": 0.2, \"dpitch\": -1.29, \"dyaw\": -2.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.21, "window_alt_abs_m": 0.61, "target_px_mean_hist": 594.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-120.91, -46.72, 22.81, -52.73, -11.57, 0.0]\n  Target bbox: [622.5, 322.97, 657.11, 396.2]\n\nFrame 2:\n  Drone pose: [-120.34, -46.69, 22.86, -52.9, -11.7, 0.0]\n  Target bbox: [623.25, 325.54, 656.46, 393.53]\n\nFrame 3:\n  Drone pose: [-119.82, -46.69, 22.92, -53.0, -11.72, 0.0]\n  Target bbox: [623.82, 330.43, 656.02, 388.6]\n\nFrame 4:\n  Drone pose: [-119.34, -46.71, 22.98, -53.06, -11.66, 0.0]\n  Target bbox: [623.99, 329.65, 655.82, 389.4]\n\nFrame 5 (current):\n  Drone pose: [-118.84, -46.71, 23.05, -53.14, -11.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.26, \"ymin\": 330.05, \"xmax\": 656.57, \"ymax\": 388.96}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.07, \"dpitch\": -0.08, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -0.02, \"dz\": 0.15, \"dpitch\": -0.15, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 1.46, \"dy\": -0.03, \"dz\": 0.23, \"dpitch\": -0.23, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 1.95, \"dy\": -0.04, \"dz\": 0.32, \"dpitch\": -0.33, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": -0.03, \"dz\": 0.4, \"dpitch\": -0.46, \"dyaw\": 0.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.25, "window_alt_abs_m": 0.24, "target_px_mean_hist": 568.2, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-116.37, -46.74, 23.45, -53.6, -11.53, 0.0]\n  Target bbox: [623.7, 327.97, 656.06, 391.07]\n\nFrame 2:\n  Drone pose: [-115.87, -46.72, 23.55, -53.71, -11.58, 0.0]\n  Target bbox: [622.99, 324.12, 656.65, 395.04]\n\nFrame 3:\n  Drone pose: [-115.39, -46.72, 23.64, -53.79, -11.57, 0.0]\n  Target bbox: [623.77, 327.98, 655.99, 391.07]\n\nFrame 4:\n  Drone pose: [-114.9, -46.7, 23.73, -53.87, -11.64, 0.0]\n  Target bbox: [622.61, 323.95, 657.02, 395.21]\n\nFrame 5 (current):\n  Drone pose: [-114.38, -46.66, 23.81, -53.99, -11.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.32, \"ymin\": 331.45, \"xmax\": 655.52, \"ymax\": 387.63}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": -0.02, \"dz\": 0.15, \"dpitch\": -0.21, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.03, \"dz\": 0.26, \"dpitch\": -0.41, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -0.14, \"dz\": 0.37, \"dpitch\": -0.38, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": -0.26, \"dz\": 0.47, \"dpitch\": -0.08, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": 2.17, \"dy\": -0.59, \"dz\": 0.6, \"dpitch\": -0.19, \"dyaw\": 0.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.27, "window_alt_abs_m": 0.36, "target_px_mean_hist": 537.2, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.21, -47.25, 24.41, -54.18, -11.27, 0.0]\n  Target bbox: [623.76, 325.92, 655.94, 393.18]\n\nFrame 2:\n  Drone pose: [-111.82, -47.39, 24.44, -54.1, -10.71, 0.0]\n  Target bbox: [623.61, 330.19, 656.18, 388.9]\n\nFrame 3:\n  Drone pose: [-111.47, -47.5, 24.45, -53.9, -10.28, 0.0]\n  Target bbox: [624.92, 331.7, 654.9, 387.44]\n\nFrame 4:\n  Drone pose: [-111.18, -47.62, 24.46, -53.62, -9.77, 0.0]\n  Target bbox: [624.28, 326.62, 655.43, 392.51]\n\nFrame 5 (current):\n  Drone pose: [-110.91, -47.83, 25.26, -54.24, -8.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.55, \"ymin\": 331.1, \"xmax\": 655.25, \"ymax\": 388.05}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": -0.05, \"dz\": -0.22, \"dpitch\": 0.52, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": -0.06, \"dz\": -0.13, \"dpitch\": 0.84, \"dyaw\": -1.21, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 0.05, \"dz\": -0.23, \"dpitch\": 1.39, \"dyaw\": -2.99, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": 0.25, \"dz\": 0.27, \"dpitch\": 1.31, \"dyaw\": -5.01, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 0.66, \"dz\": -0.53, \"dpitch\": 2.81, \"dyaw\": -7.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.32, "window_alt_abs_m": 0.85, "target_px_mean_hist": 500.5, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-109.06, -46.71, 23.96, -49.71, -18.99, 0.0]\n  Target bbox: [622.61, 326.67, 657.12, 392.69]\n\nFrame 2:\n  Drone pose: [-108.72, -46.9, 23.83, -49.41, -18.31, 0.0]\n  Target bbox: [621.59, 327.86, 658.55, 391.51]\n\nFrame 3:\n  Drone pose: [-108.34, -46.82, 23.68, -48.79, -19.76, 0.0]\n  Target bbox: [623.28, 327.91, 656.5, 391.44]\n\nFrame 4:\n  Drone pose: [-107.89, -46.78, 23.45, -48.42, -19.85, 0.0]\n  Target bbox: [624.05, 329.34, 656.09, 390.02]\n\nFrame 5 (current):\n  Drone pose: [-107.47, -46.76, 23.25, -47.8, -21.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.25, \"ymin\": 330.76, \"xmax\": 655.64, \"ymax\": 388.57}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.05, \"dz\": -0.22, \"dpitch\": 0.25, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -0.12, \"dz\": -0.44, \"dpitch\": -0.04, \"dyaw\": -1.65, \"droll\": 0.0}, {\"dx\": 2.81, \"dy\": 0.03, \"dz\": -0.7, \"dpitch\": -0.55, \"dyaw\": -2.88, \"droll\": 0.0}, {\"dx\": 4.22, \"dy\": -0.32, \"dz\": -0.88, \"dpitch\": -1.47, \"dyaw\": -4.46, \"droll\": 0.0}, {\"dx\": 5.8, \"dy\": -1.15, \"dz\": -1.05, \"dpitch\": -3.34, \"dyaw\": -3.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.51, "window_alt_abs_m": 0.71, "target_px_mean_hist": 487.5, "cur_frame_id": 41, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.67, -47.91, 22.2, -51.14, -24.66, 0.0]\n  Target bbox: [622.31, 328.27, 657.6, 390.8]\n\nFrame 2:\n  Drone pose: [-101.18, -49.05, 22.02, -51.63, -21.09, 0.0]\n  Target bbox: [616.54, 322.41, 663.7, 396.73]\n\nFrame 3:\n  Drone pose: [-100.95, -49.98, 21.85, -51.23, -19.38, 0.0]\n  Target bbox: [621.09, 324.17, 658.62, 394.97]\n\nFrame 4:\n  Drone pose: [-101.12, -50.85, 21.69, -50.39, -15.9, 0.0]\n  Target bbox: [623.65, 326.42, 656.51, 392.7]\n\nFrame 5 (current):\n  Drone pose: [-101.11, -51.12, 21.54, -49.32, -16.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.65, \"ymin\": 323.7, \"xmax\": 658.01, \"ymax\": 395.55}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.29, \"dz\": -0.15, \"dpitch\": 0.8, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -0.49, \"dz\": -0.28, \"dpitch\": 1.75, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.68, \"dz\": -0.41, \"dpitch\": 2.48, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": -0.83, \"dz\": -0.52, \"dpitch\": 2.94, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.97, \"dz\": -0.63, \"dpitch\": 3.14, \"dyaw\": 1.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.04, "window_alt_abs_m": 0.66, "target_px_mean_hist": 592.2, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.11, -52.09, 20.91, -46.18, -15.02, 0.0]\n  Target bbox: [622.1, 325.07, 658.09, 394.29]\n\nFrame 2:\n  Drone pose: [-99.66, -52.23, 20.81, -45.82, -16.02, 0.0]\n  Target bbox: [621.92, 322.38, 657.69, 397.03]\n\nFrame 3:\n  Drone pose: [-99.22, -52.37, 20.72, -45.67, -15.58, 0.0]\n  Target bbox: [621.87, 323.08, 657.76, 396.32]\n\nFrame 4:\n  Drone pose: [-98.77, -52.51, 20.63, -45.53, -15.13, 0.0]\n  Target bbox: [616.71, 321.98, 663.44, 397.46]\n\nFrame 5 (current):\n  Drone pose: [-98.32, -52.66, 20.56, -45.21, -16.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.08, \"ymin\": 323.02, \"xmax\": 657.55, \"ymax\": 396.43}, \"waypoint_deltas\": [{\"dx\": 0.45, \"dy\": -0.17, \"dz\": -0.07, \"dpitch\": 0.1, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.34, \"dz\": -0.13, \"dpitch\": 0.38, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 1.39, \"dy\": -0.52, \"dz\": -0.19, \"dpitch\": 0.43, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 1.87, \"dy\": -0.71, \"dz\": -0.24, \"dpitch\": 0.65, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": 2.37, \"dy\": -0.9, \"dz\": -0.28, \"dpitch\": 0.65, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.83, "window_alt_abs_m": 0.35, "target_px_mean_hist": 599.0, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.45, -53.75, 20.24, -44.37, -17.13, 0.0]\n  Target bbox: [621.78, 322.95, 657.84, 396.58]\n\nFrame 2:\n  Drone pose: [-94.96, -53.93, 20.2, -44.39, -16.63, 0.0]\n  Target bbox: [617.95, 322.9, 662.26, 396.64]\n\nFrame 3:\n  Drone pose: [-94.46, -54.1, 20.17, -44.21, -17.53, 0.0]\n  Target bbox: [623.16, 327.81, 656.68, 391.49]\n\nFrame 4:\n  Drone pose: [-93.96, -54.27, 20.15, -44.24, -17.05, 0.0]\n  Target bbox: [623.18, 327.81, 656.66, 391.49]\n\nFrame 5 (current):\n  Drone pose: [-93.47, -54.45, 20.12, -44.27, -16.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.66, \"ymin\": 326.29, \"xmax\": 656.5, \"ymax\": 393.08}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.19, \"dz\": -0.02, \"dpitch\": 0.16, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.38, \"dz\": -0.03, \"dpitch\": 0.11, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.58, \"dz\": -0.05, \"dpitch\": 0.26, \"dyaw\": -1.18, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.78, \"dz\": -0.06, \"dpitch\": 0.2, \"dyaw\": -0.62, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.98, \"dz\": -0.07, \"dpitch\": 0.34, \"dyaw\": -1.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.38, "window_alt_abs_m": 0.11, "target_px_mean_hist": 619.8, "cur_frame_id": 69, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.97, -55.43, 20.05, -43.93, -17.99, 0.0]\n  Target bbox: [621.81, 323.99, 657.86, 395.5]\n\nFrame 2:\n  Drone pose: [-90.47, -55.63, 20.04, -44.0, -17.43, 0.0]\n  Target bbox: [622.22, 324.57, 658.0, 394.88]\n\nFrame 3:\n  Drone pose: [-89.97, -55.83, 20.03, -43.86, -18.27, 0.0]\n  Target bbox: [622.13, 325.55, 657.62, 393.86]\n\nFrame 4:\n  Drone pose: [-89.47, -56.02, 20.03, -43.93, -17.75, 0.0]\n  Target bbox: [620.89, 324.67, 659.29, 394.77]\n\nFrame 5 (current):\n  Drone pose: [-88.97, -56.2, 20.02, -43.78, -18.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.66, \"ymin\": 326.6, \"xmax\": 657.15, \"ymax\": 392.73}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": -0.37, \"dz\": -0.01, \"dpitch\": -0.14, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.56, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.77, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": -0.98, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": -0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.78, "window_alt_abs_m": 0.03, "target_px_mean_hist": 616.0, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.48, -57.18, 20.01, -43.74, -18.66, 0.0]\n  Target bbox: [621.19, 321.81, 658.41, 397.72]\n\nFrame 2:\n  Drone pose: [-85.98, -57.4, 20.0, -43.83, -18.06, 0.0]\n  Target bbox: [617.04, 322.24, 663.19, 397.33]\n\nFrame 3:\n  Drone pose: [-85.48, -57.62, 20.0, -43.7, -18.82, 0.0]\n  Target bbox: [621.65, 323.32, 658.01, 396.18]\n\nFrame 4:\n  Drone pose: [-84.99, -57.84, 20.0, -43.8, -18.22, 0.0]\n  Target bbox: [622.46, 325.69, 657.71, 393.73]\n\nFrame 5 (current):\n  Drone pose: [-84.49, -58.06, 20.0, -43.67, -18.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.01, \"ymin\": 327.03, \"xmax\": 656.8, \"ymax\": 392.33}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.4, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.8, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.73, "window_alt_abs_m": 0.0, "target_px_mean_hist": 610.0, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-124.21, -50.15, 22.0, -58.41, -4.66, 0.0]\n  Target bbox: [673.24, 259.21, 705.31, 326.6]\n\nFrame 2:\n  Drone pose: [-124.7, -49.38, 22.48, -53.2, -2.39, 0.0]\n  Target bbox: [624.65, 323.13, 654.95, 395.91]\n\nFrame 3:\n  Drone pose: [-124.61, -48.65, 22.71, -53.29, -1.65, 0.0]\n  Target bbox: [590.76, 317.33, 622.71, 384.28]\n\nFrame 4:\n  Drone pose: [-124.42, -47.91, 22.59, -48.55, -9.3, 0.0]\n  Target bbox: [644.32, 384.56, 678.46, 448.9]\n\nFrame 5 (current):\n  Drone pose: [-124.16, -47.29, 22.61, -54.26, -5.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.43, \"ymin\": 279.91, \"xmax\": 620.64, \"ymax\": 346.0}, \"waypoint_deltas\": [{\"dx\": 0.62, \"dy\": 0.26, \"dz\": 0.03, \"dpitch\": 2.66, \"dyaw\": -4.2, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": 0.41, \"dz\": 0.06, \"dpitch\": 2.35, \"dyaw\": -4.81, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": 0.49, \"dz\": 0.09, \"dpitch\": 2.01, \"dyaw\": -5.24, \"droll\": 0.0}, {\"dx\": 2.66, \"dy\": 0.54, \"dz\": 0.12, \"dpitch\": 1.77, \"dyaw\": -5.48, \"droll\": 0.0}, {\"dx\": 3.25, \"dy\": 0.57, \"dz\": 0.2, \"dpitch\": 1.53, \"dyaw\": -5.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.05, "window_alt_abs_m": 0.86, "target_px_mean_hist": 584.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-120.91, -46.72, 22.81, -51.84, -15.97, 0.0]\n  Target bbox: [667.46, 345.73, 702.62, 405.94]\n\nFrame 2:\n  Drone pose: [-120.34, -46.69, 22.86, -52.9, -11.7, 0.0]\n  Target bbox: [622.18, 325.89, 657.52, 393.15]\n\nFrame 3:\n  Drone pose: [-119.88, -46.81, 23.03, -54.85, -9.66, 0.0]\n  Target bbox: [607.18, 301.32, 638.83, 358.73]\n\nFrame 4:\n  Drone pose: [-119.23, -46.76, 23.06, -53.34, -11.55, 0.0]\n  Target bbox: [623.11, 324.32, 656.55, 394.75]\n\nFrame 5 (current):\n  Drone pose: [-118.92, -46.6, 23.1, -53.04, -11.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.07, \"ymin\": 322.86, \"xmax\": 657.55, \"ymax\": 396.26}, \"waypoint_deltas\": [{\"dx\": 0.58, \"dy\": -0.11, \"dz\": 0.02, \"dpitch\": -0.18, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.13, \"dz\": 0.1, \"dpitch\": -0.25, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.14, \"dz\": 0.18, \"dpitch\": -0.33, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": -0.15, \"dz\": 0.27, \"dpitch\": -0.43, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -0.14, \"dz\": 0.35, \"dpitch\": -0.56, \"dyaw\": 0.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.63, "window_alt_abs_m": 0.29, "target_px_mean_hist": 569.5, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-116.31, -46.7, 23.27, -54.48, -12.61, 0.0]\n  Target bbox: [633.12, 313.03, 665.15, 371.97]\n\nFrame 2:\n  Drone pose: [-115.76, -46.84, 23.45, -53.8, -11.26, 0.0]\n  Target bbox: [623.45, 325.09, 656.23, 393.99]\n\nFrame 3:\n  Drone pose: [-115.4, -46.69, 23.55, -56.34, -16.68, 0.0]\n  Target bbox: [671.41, 280.49, 708.72, 351.77]\n\nFrame 4:\n  Drone pose: [-114.9, -46.7, 23.73, -53.24, -7.45, 0.0]\n  Target bbox: [582.19, 343.31, 613.93, 399.37]\n\nFrame 5 (current):\n  Drone pose: [-114.38, -46.66, 23.81, -53.99, -11.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.79, \"ymin\": 326.51, \"xmax\": 655.92, \"ymax\": 392.6}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": -0.02, \"dz\": 0.15, \"dpitch\": -0.21, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.03, \"dz\": 0.26, \"dpitch\": -0.41, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -0.14, \"dz\": 0.37, \"dpitch\": -0.38, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": -0.26, \"dz\": 0.47, \"dpitch\": -0.08, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": 2.17, \"dy\": -0.59, \"dz\": 0.6, \"dpitch\": -0.19, \"dyaw\": 0.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.36, "window_alt_abs_m": 0.54, "target_px_mean_hist": 530.8, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.22, -47.33, 24.53, -57.58, -12.01, 0.0]\n  Target bbox: [634.04, 277.05, 666.16, 333.58]\n\nFrame 2:\n  Drone pose: [-111.82, -47.39, 24.44, -57.78, -5.71, 0.0]\n  Target bbox: [573.67, 265.2, 606.69, 333.55]\n\nFrame 3:\n  Drone pose: [-111.57, -47.37, 24.57, -53.85, -10.63, 0.0]\n  Target bbox: [623.38, 329.07, 656.39, 390.06]\n\nFrame 4:\n  Drone pose: [-111.28, -47.61, 24.49, -53.51, -9.73, 0.0]\n  Target bbox: [624.35, 326.54, 655.35, 392.6]\n\nFrame 5 (current):\n  Drone pose: [-110.82, -47.9, 25.09, -52.26, -3.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 575.93, \"ymin\": 368.37, \"xmax\": 605.16, \"ymax\": 419.59}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": 0.02, \"dz\": -0.05, \"dpitch\": -1.46, \"dyaw\": -4.91, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": 0.01, \"dz\": 0.04, \"dpitch\": -1.14, \"dyaw\": -6.37, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 0.12, \"dz\": -0.06, \"dpitch\": -0.59, \"dyaw\": -8.15, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": 0.32, \"dz\": 0.44, \"dpitch\": -0.67, \"dyaw\": -10.17, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.73, \"dz\": -0.36, \"dpitch\": 0.83, \"dyaw\": -12.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.07, "window_alt_abs_m": 0.91, "target_px_mean_hist": 499.2, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.95, -46.84, 23.88, -48.97, -22.56, 0.0]\n  Target bbox: [662.67, 339.87, 701.07, 410.2]\n\nFrame 2:\n  Drone pose: [-108.72, -46.98, 23.82, -50.4, -18.4, 0.0]\n  Target bbox: [625.04, 312.12, 662.16, 374.7]\n\nFrame 3:\n  Drone pose: [-108.34, -46.82, 23.68, -48.79, -19.76, 0.0]\n  Target bbox: [623.72, 330.42, 656.15, 388.88]\n\nFrame 4:\n  Drone pose: [-107.85, -46.76, 23.62, -46.83, -21.29, 0.0]\n  Target bbox: [638.74, 359.62, 671.98, 422.25]\n\nFrame 5 (current):\n  Drone pose: [-107.45, -46.69, 23.31, -43.32, -21.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.92, \"ymin\": 404.44, \"xmax\": 663.9, \"ymax\": 467.96}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": -0.12, \"dz\": -0.28, \"dpitch\": -4.23, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.19, \"dz\": -0.5, \"dpitch\": -4.52, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 2.79, \"dy\": -0.04, \"dz\": -0.76, \"dpitch\": -5.03, \"dyaw\": -2.07, \"droll\": 0.0}, {\"dx\": 4.2, \"dy\": -0.39, \"dz\": -0.94, \"dpitch\": -5.95, \"dyaw\": -3.65, \"droll\": 0.0}, {\"dx\": 5.78, \"dy\": -1.22, \"dz\": -1.11, \"dpitch\": -7.82, \"dyaw\": -2.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.7, "window_alt_abs_m": 0.57, "target_px_mean_hist": 493.5, "cur_frame_id": 41, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.67, -47.91, 22.2, -49.75, -20.25, 0.0]\n  Target bbox: [574.87, 353.22, 611.22, 415.29]\n\nFrame 2:\n  Drone pose: [-101.09, -49.15, 21.94, -51.72, -20.9, 0.0]\n  Target bbox: [619.78, 323.85, 660.44, 395.22]\n\nFrame 3:\n  Drone pose: [-100.95, -49.98, 21.85, -51.03, -15.33, 0.0]\n  Target bbox: [577.96, 328.81, 615.7, 399.29]\n\nFrame 4:\n  Drone pose: [-101.12, -50.85, 21.69, -50.39, -15.9, 0.0]\n  Target bbox: [619.96, 325.42, 660.12, 393.67]\n\nFrame 5 (current):\n  Drone pose: [-101.07, -51.1, 21.6, -49.3, -16.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.11, \"ymin\": 330.32, \"xmax\": 655.54, \"ymax\": 394.14}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.31, \"dz\": -0.21, \"dpitch\": 0.78, \"dyaw\": 1.2, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.51, \"dz\": -0.34, \"dpitch\": 1.73, \"dyaw\": 0.68, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -0.7, \"dz\": -0.47, \"dpitch\": 2.46, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.85, \"dz\": -0.58, \"dpitch\": 2.92, \"dyaw\": 0.62, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": -0.99, \"dz\": -0.69, \"dpitch\": 3.12, \"dyaw\": 1.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.0, "window_alt_abs_m": 0.6, "target_px_mean_hist": 584.2, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.11, -52.09, 20.91, -46.21, -14.17, 0.0]\n  Target bbox: [610.87, 324.79, 649.01, 393.67]\n\nFrame 2:\n  Drone pose: [-99.66, -52.23, 20.81, -43.49, -16.56, 0.0]\n  Target bbox: [629.33, 365.7, 663.35, 431.83]\n\nFrame 3:\n  Drone pose: [-99.28, -52.41, 20.69, -48.0, -12.13, 0.0]\n  Target bbox: [584.18, 288.26, 617.76, 350.06]\n\nFrame 4:\n  Drone pose: [-98.87, -52.63, 20.56, -45.32, -14.7, 0.0]\n  Target bbox: [623.37, 325.87, 656.81, 393.49]\n\nFrame 5 (current):\n  Drone pose: [-98.25, -52.85, 20.59, -45.43, -15.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.83, \"ymin\": 323.2, \"xmax\": 657.79, \"ymax\": 396.27}, \"waypoint_deltas\": [{\"dx\": 0.38, \"dy\": 0.02, \"dz\": -0.1, \"dpitch\": 0.32, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": -0.15, \"dz\": -0.16, \"dpitch\": 0.6, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": -0.33, \"dz\": -0.22, \"dpitch\": 0.65, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -0.52, \"dz\": -0.27, \"dpitch\": 0.87, \"dyaw\": -1.18, \"droll\": 0.0}, {\"dx\": 2.3, \"dy\": -0.71, \"dz\": -0.31, \"dpitch\": 0.87, \"dyaw\": -0.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.3, "window_alt_abs_m": 0.38, "target_px_mean_hist": 608.8, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.28, -53.75, 20.25, -44.63, -17.27, 0.0]\n  Target bbox: [621.63, 322.36, 657.98, 397.13]\n\nFrame 2:\n  Drone pose: [-94.91, -53.99, 20.35, -44.7, -16.5, 0.0]\n  Target bbox: [624.64, 326.97, 655.5, 392.37]\n\nFrame 3:\n  Drone pose: [-94.46, -54.1, 20.17, -44.21, -17.53, 0.0]\n  Target bbox: [620.02, 321.74, 659.57, 397.77]\n\nFrame 4:\n  Drone pose: [-93.83, -54.23, 20.11, -44.35, -17.3, 0.0]\n  Target bbox: [621.49, 325.03, 658.25, 394.34]\n\nFrame 5 (current):\n  Drone pose: [-93.52, -54.42, 20.11, -43.3, -20.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 662.63, \"ymin\": 340.06, \"xmax\": 702.52, \"ymax\": 409.94}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": -0.22, \"dz\": -0.01, \"dpitch\": -0.81, \"dyaw\": 2.67, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": -0.41, \"dz\": -0.02, \"dpitch\": -0.86, \"dyaw\": 3.21, \"droll\": 0.0}, {\"dx\": 1.55, \"dy\": -0.61, \"dz\": -0.04, \"dpitch\": -0.71, \"dyaw\": 2.37, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": -0.81, \"dz\": -0.05, \"dpitch\": -0.77, \"dyaw\": 2.93, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -1.01, \"dz\": -0.06, \"dpitch\": -0.63, \"dyaw\": 2.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.84, "window_alt_abs_m": 0.34, "target_px_mean_hist": 612.2, "cur_frame_id": 69, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-91.1, -55.42, 20.09, -43.8, -17.91, 0.0]\n  Target bbox: [620.76, 321.56, 658.79, 398.08]\n\nFrame 2:\n  Drone pose: [-90.43, -55.63, 20.15, -43.92, -12.47, 0.0]\n  Target bbox: [561.13, 331.93, 598.08, 401.42]\n\nFrame 3:\n  Drone pose: [-89.97, -55.83, 20.03, -43.86, -18.27, 0.0]\n  Target bbox: [621.45, 322.47, 658.17, 397.06]\n\nFrame 4:\n  Drone pose: [-89.48, -55.86, 20.06, -46.1, -23.17, 0.0]\n  Target bbox: [683.38, 292.33, 718.44, 357.05]\n\nFrame 5 (current):\n  Drone pose: [-88.87, -56.07, 20.11, -44.0, -19.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.54, \"ymin\": 326.65, \"xmax\": 657.26, \"ymax\": 392.72}, \"waypoint_deltas\": [{\"dx\": 0.39, \"dy\": -0.31, \"dz\": -0.09, \"dpitch\": 0.15, \"dyaw\": 0.96, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -0.5, \"dz\": -0.1, \"dpitch\": 0.08, \"dyaw\": 1.48, \"droll\": 0.0}, {\"dx\": 1.39, \"dy\": -0.69, \"dz\": -0.1, \"dpitch\": 0.22, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": -0.9, \"dz\": -0.1, \"dpitch\": 0.13, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": 2.39, \"dy\": -1.11, \"dz\": -0.1, \"dpitch\": 0.26, \"dyaw\": 0.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.23, "window_alt_abs_m": 0.27, "target_px_mean_hist": 616.5, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.47, -57.04, 20.12, -41.62, -22.07, 0.0]\n  Target bbox: [659.0, 365.02, 694.2, 430.81]\n\nFrame 2:\n  Drone pose: [-86.12, -57.37, 19.88, -43.45, -18.03, 0.0]\n  Target bbox: [623.43, 326.02, 656.75, 393.41]\n\nFrame 3:\n  Drone pose: [-85.48, -57.62, 20.0, -43.7, -18.82, 0.0]\n  Target bbox: [619.64, 320.89, 659.92, 398.7]\n\nFrame 4:\n  Drone pose: [-84.94, -57.75, 20.0, -43.83, -18.52, 0.0]\n  Target bbox: [620.78, 323.14, 659.47, 396.31]\n\nFrame 5 (current):\n  Drone pose: [-84.49, -58.06, 20.0, -43.67, -18.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.35, \"ymin\": 322.42, \"xmax\": 659.29, \"ymax\": 397.08}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.4, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.8, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.59, "window_alt_abs_m": 0.36, "target_px_mean_hist": 621.5, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_936", "difficulty_score": 0.3604, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [111.37, 26.45, 22.0, -46.48, 177.14, 0.0]\n  Target bbox: [619.88, 325.29, 660.19, 394.17]\n\nFrame 2:\n  Drone pose: [110.85, 26.73, 21.2, -45.39, 176.53, 0.0]\n  Target bbox: [628.66, 328.22, 651.12, 391.04]\n\nFrame 3:\n  Drone pose: [110.32, 27.02, 20.67, -44.68, 177.35, 0.0]\n  Target bbox: [628.63, 325.4, 651.09, 393.95]\n\nFrame 4:\n  Drone pose: [109.8, 27.31, 20.64, -44.69, 178.17, 0.0]\n  Target bbox: [628.86, 329.78, 650.94, 389.47]\n\nFrame 5 (current):\n  Drone pose: [109.28, 27.6, 20.62, -44.69, 179.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.05, \"ymin\": 325.97, \"xmax\": 650.68, \"ymax\": 393.35}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.28, \"dz\": -0.03, \"dpitch\": 0.0, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.57, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 0.86, \"dz\": -0.07, \"dpitch\": 0.01, \"dyaw\": 2.48, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 1.15, \"dz\": -0.09, \"dpitch\": 0.03, \"dyaw\": 3.32, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 1.43, \"dz\": -0.2, \"dpitch\": 0.17, \"dyaw\": 4.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.08, "window_alt_abs_m": 1.38, "target_px_mean_hist": 472.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.67, 29.03, 20.42, -44.52, -176.85, 0.0]\n  Target bbox: [628.15, 323.97, 652.19, 395.5]\n\nFrame 2:\n  Drone pose: [106.15, 29.32, 20.39, -44.48, -176.02, 0.0]\n  Target bbox: [628.36, 327.6, 651.88, 391.7]\n\nFrame 3:\n  Drone pose: [105.63, 29.61, 20.36, -44.43, -175.18, 0.0]\n  Target bbox: [628.16, 328.5, 652.04, 390.71]\n\nFrame 4:\n  Drone pose: [105.11, 29.9, 20.33, -44.38, -174.35, 0.0]\n  Target bbox: [627.82, 324.65, 652.49, 394.73]\n\nFrame 5 (current):\n  Drone pose: [104.59, 30.18, 20.3, -44.33, -173.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.0, \"ymin\": 325.03, \"xmax\": 653.31, \"ymax\": 394.33}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.29, \"dz\": -0.03, \"dpitch\": 0.06, \"dyaw\": 0.83, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.58, \"dz\": -0.06, \"dpitch\": 0.13, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.87, \"dz\": -0.08, \"dpitch\": 0.2, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 1.15, \"dz\": -0.11, \"dpitch\": 0.27, \"dyaw\": 3.32, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 1.44, \"dz\": -0.13, \"dpitch\": 0.35, \"dyaw\": 4.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.34, "window_alt_abs_m": 0.12, "target_px_mean_hist": 491.2, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.98, 31.62, 20.17, -43.98, -169.37, 0.0]\n  Target bbox: [626.47, 323.99, 653.85, 395.41]\n\nFrame 2:\n  Drone pose: [101.46, 31.91, 20.15, -43.91, -168.55, 0.0]\n  Target bbox: [626.36, 324.57, 653.95, 394.83]\n\nFrame 3:\n  Drone pose: [100.94, 32.2, 20.13, -43.83, -167.73, 0.0]\n  Target bbox: [626.67, 326.39, 653.58, 392.98]\n\nFrame 4:\n  Drone pose: [100.44, 32.2, 20.12, -43.8, -167.73, 0.0]\n  Target bbox: [624.92, 322.53, 655.48, 397.03]\n\nFrame 5 (current):\n  Drone pose: [99.94, 32.2, 20.1, -43.78, -167.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.82, \"ymin\": 322.39, \"xmax\": 655.58, \"ymax\": 397.15}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.07, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.08, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.64, "window_alt_abs_m": 0.07, "target_px_mean_hist": 488.5, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.44, 32.2, 20.05, -43.7, -167.73, 0.0]\n  Target bbox: [624.83, 322.38, 655.56, 397.14]\n\nFrame 2:\n  Drone pose: [96.94, 32.2, 20.04, -43.69, -167.73, 0.0]\n  Target bbox: [626.79, 328.63, 653.38, 390.6]\n\nFrame 3:\n  Drone pose: [96.44, 32.2, 20.04, -43.68, -167.73, 0.0]\n  Target bbox: [625.65, 325.41, 654.63, 393.98]\n\nFrame 4:\n  Drone pose: [95.94, 32.2, 20.03, -43.67, -167.73, 0.0]\n  Target bbox: [626.31, 325.19, 653.98, 394.21]\n\nFrame 5 (current):\n  Drone pose: [95.44, 32.2, 20.03, -43.66, -167.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.5, \"ymin\": 323.51, \"xmax\": 654.85, \"ymax\": 396.0}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 1.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 1.5, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": 1.57, \"dz\": -0.02, \"dpitch\": -0.37, \"dyaw\": -1.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.02, "target_px_mean_hist": 486.0, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.04, 33.81, 20.01, -44.45, -170.02, 0.0]\n  Target bbox: [622.46, 326.46, 657.54, 392.85]\n\nFrame 2:\n  Drone pose: [91.29, 33.74, 20.01, -44.95, -171.58, 0.0]\n  Target bbox: [618.16, 322.81, 661.79, 396.62]\n\nFrame 3:\n  Drone pose: [90.52, 33.65, 20.01, -45.46, -173.25, 0.0]\n  Target bbox: [620.57, 324.15, 659.41, 395.14]\n\nFrame 4:\n  Drone pose: [89.72, 33.35, 20.0, -46.05, -175.6, 0.0]\n  Target bbox: [617.66, 321.99, 662.34, 397.38]\n\nFrame 5 (current):\n  Drone pose: [88.93, 32.96, 20.0, -46.58, -178.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.19, \"ymin\": 327.26, \"xmax\": 652.04, \"ymax\": 391.78}, \"waypoint_deltas\": [{\"dx\": -0.78, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -0.45, \"dyaw\": -2.0, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -1.19, \"dz\": 0.0, \"dpitch\": -0.79, \"dyaw\": -5.47, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -1.56, \"dz\": 0.0, \"dpitch\": -0.53, \"dyaw\": -8.34, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -1.68, \"dz\": 0.0, \"dpitch\": -0.34, \"dyaw\": -10.33, \"droll\": 0.0}, {\"dx\": -3.01, \"dy\": -1.79, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -12.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.33, "window_alt_abs_m": 0.01, "target_px_mean_hist": 509.8, "cur_frame_id": 41, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.92, 31.17, 20.0, -46.68, 169.41, 0.0]\n  Target bbox: [621.19, 323.33, 658.96, 395.88]\n\nFrame 2:\n  Drone pose: [85.51, 31.16, 20.0, -46.37, 167.87, 0.0]\n  Target bbox: [622.22, 323.94, 657.96, 395.28]\n\nFrame 3:\n  Drone pose: [85.03, 31.09, 20.0, -46.14, 166.12, 0.0]\n  Target bbox: [620.34, 322.72, 659.81, 396.48]\n\nFrame 4:\n  Drone pose: [84.43, 30.97, 20.0, -46.04, 164.15, 0.0]\n  Target bbox: [622.58, 324.03, 657.62, 395.2]\n\nFrame 5 (current):\n  Drone pose: [83.72, 30.81, 20.0, -46.06, 161.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.32, \"ymin\": 323.29, \"xmax\": 659.84, \"ymax\": 395.91}, \"waypoint_deltas\": [{\"dx\": -0.76, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -2.29, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.34, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -4.46, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -6.37, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.49, \"dyaw\": -8.0, \"droll\": 0.0}, {\"dx\": -3.17, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": 0.5, \"dyaw\": -8.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 525.5, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [80.55, 30.29, 20.0, -45.56, 153.93, 0.0]\n  Target bbox: [620.61, 320.95, 659.04, 398.41]\n\nFrame 2:\n  Drone pose: [80.0, 30.32, 20.0, -45.65, 153.92, 0.0]\n  Target bbox: [621.8, 323.0, 657.94, 396.3]\n\nFrame 3:\n  Drone pose: [79.33, 30.39, 20.0, -45.93, 153.88, 0.0]\n  Target bbox: [624.3, 326.23, 655.58, 392.89]\n\nFrame 4:\n  Drone pose: [78.52, 30.53, 20.0, -46.46, 153.85, 0.0]\n  Target bbox: [623.11, 324.03, 656.69, 395.12]\n\nFrame 5 (current):\n  Drone pose: [77.66, 30.76, 20.0, -47.16, 154.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.67, \"ymin\": 320.99, \"xmax\": 659.01, \"ymax\": 398.23}, \"waypoint_deltas\": [{\"dx\": -0.83, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": -0.71, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": -1.3, \"dyaw\": 1.47, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": -1.65, \"dyaw\": 2.73, \"droll\": 0.0}, {\"dx\": -2.47, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": -1.74, \"dyaw\": 4.06, \"droll\": 0.0}, {\"dx\": -2.77, \"dy\": 1.78, \"dz\": 0.0, \"dpitch\": -1.6, \"dyaw\": 5.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.24, "window_alt_abs_m": 0.0, "target_px_mean_hist": 533.0, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.61, 32.56, 20.0, -48.41, 159.46, 0.0]\n  Target bbox: [623.38, 323.4, 656.37, 395.66]\n\nFrame 2:\n  Drone pose: [74.3, 32.53, 20.0, -48.08, 159.61, 0.0]\n  Target bbox: [623.24, 323.58, 656.5, 395.52]\n\nFrame 3:\n  Drone pose: [74.01, 32.48, 20.0, -47.74, 159.7, 0.0]\n  Target bbox: [622.95, 322.55, 656.75, 396.59]\n\nFrame 4:\n  Drone pose: [73.62, 32.32, 20.0, -47.48, 159.3, 0.0]\n  Target bbox: [622.22, 321.43, 657.43, 397.79]\n\nFrame 5 (current):\n  Drone pose: [73.17, 32.13, 20.0, -47.29, 158.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.55, \"ymin\": 319.45, \"xmax\": 659.01, \"ymax\": 399.89}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": -1.23, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -0.94, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": -0.53, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": -0.86, \"dyaw\": -1.75, \"droll\": 0.0}, {\"dx\": -3.41, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": -1.19, \"dyaw\": -2.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.13, "window_alt_abs_m": 0.0, "target_px_mean_hist": 551.8, "cur_frame_id": 69, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [69.76, 31.8, 20.0, -48.48, 156.63, 0.0]\n  Target bbox: [621.39, 322.77, 658.84, 396.23]\n\nFrame 2:\n  Drone pose: [69.03, 31.77, 20.0, -48.46, 154.64, 0.0]\n  Target bbox: [622.66, 324.38, 657.53, 394.57]\n\nFrame 3:\n  Drone pose: [68.3, 31.73, 20.0, -48.4, 152.65, 0.0]\n  Target bbox: [624.1, 324.72, 656.13, 394.31]\n\nFrame 4:\n  Drone pose: [67.57, 31.7, 20.0, -48.31, 150.67, 0.0]\n  Target bbox: [624.01, 326.26, 656.16, 392.67]\n\nFrame 5 (current):\n  Drone pose: [66.85, 31.66, 20.0, -48.19, 148.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.13, \"ymin\": 323.64, \"xmax\": 657.16, \"ymax\": 395.4}, \"waypoint_deltas\": [{\"dx\": -0.73, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -1.95, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": -3.87, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -4.09, \"droll\": 0.0}, {\"dx\": -2.75, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": -5.61, \"droll\": 0.0}, {\"dx\": -3.31, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 0.69, \"dyaw\": -6.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.92, "window_alt_abs_m": 0.0, "target_px_mean_hist": 562.5, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [63.54, 31.76, 20.0, -47.5, 141.93, 0.0]\n  Target bbox: [622.89, 323.42, 657.48, 395.66]\n\nFrame 2:\n  Drone pose: [62.97, 31.86, 20.0, -47.17, 140.79, 0.0]\n  Target bbox: [623.41, 323.91, 656.97, 395.23]\n\nFrame 3:\n  Drone pose: [62.41, 31.97, 20.0, -46.84, 139.67, 0.0]\n  Target bbox: [625.86, 326.87, 654.38, 392.12]\n\nFrame 4:\n  Drone pose: [61.85, 32.07, 20.0, -46.73, 138.58, 0.0]\n  Target bbox: [629.12, 326.94, 651.1, 392.18]\n\nFrame 5 (current):\n  Drone pose: [61.29, 32.18, 20.0, -46.39, 137.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.8, \"ymin\": 325.47, \"xmax\": 657.15, \"ymax\": 393.72}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": -0.94, \"droll\": 0.0}, {\"dx\": -1.69, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -1.96, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": 0.42, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": -4.07, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": 0.73, \"dyaw\": -5.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.42, "window_alt_abs_m": 0.0, "target_px_mean_hist": 548.5, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [111.37, 26.45, 22.0, -42.7, 182.14, 0.0]\n  Target bbox: [562.13, 391.52, 601.38, 458.71]\n\nFrame 2:\n  Drone pose: [110.85, 26.73, 21.2, -46.09, 176.57, 0.0]\n  Target bbox: [625.38, 312.07, 653.24, 383.62]\n\nFrame 3:\n  Drone pose: [110.48, 27.07, 20.71, -47.52, 176.54, 0.0]\n  Target bbox: [622.39, 322.8, 657.94, 396.43]\n\nFrame 4:\n  Drone pose: [109.82, 27.29, 20.74, -43.03, 168.41, 0.0]\n  Target bbox: [683.24, 308.68, 722.2, 376.29]\n\nFrame 5 (current):\n  Drone pose: [109.28, 27.6, 20.62, -49.45, 181.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 597.17, \"ymin\": 245.88, \"xmax\": 620.87, \"ymax\": 314.27}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.28, \"dz\": -0.03, \"dpitch\": 4.76, \"dyaw\": -1.75, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.57, \"dz\": -0.05, \"dpitch\": 4.76, \"dyaw\": -0.92, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 0.86, \"dz\": -0.07, \"dpitch\": 4.77, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 1.15, \"dz\": -0.09, \"dpitch\": 4.79, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 1.43, \"dz\": -0.2, \"dpitch\": 4.93, \"dyaw\": 1.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 26.88, "window_alt_abs_m": 1.46, "target_px_mean_hist": 453.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.67, 29.03, 20.42, -46.67, -171.85, 0.0]\n  Target bbox: [565.98, 289.02, 594.1, 362.04]\n\nFrame 2:\n  Drone pose: [106.15, 29.32, 20.39, -43.55, -180.04, 0.0]\n  Target bbox: [676.47, 341.66, 700.62, 411.3]\n\nFrame 3:\n  Drone pose: [105.63, 29.61, 20.36, -40.39, -180.18, 0.0]\n  Target bbox: [687.95, 395.0, 712.96, 464.0]\n\nFrame 4:\n  Drone pose: [105.11, 29.9, 20.33, -39.88, -169.35, 0.0]\n  Target bbox: [566.35, 404.53, 593.14, 469.91]\n\nFrame 5 (current):\n  Drone pose: [104.59, 30.18, 20.3, -44.33, -173.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.9, \"ymin\": 326.62, \"xmax\": 652.35, \"ymax\": 392.65}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.29, \"dz\": -0.03, \"dpitch\": 0.06, \"dyaw\": 0.83, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.58, \"dz\": -0.06, \"dpitch\": 0.13, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.87, \"dz\": -0.08, \"dpitch\": 0.2, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 1.15, \"dz\": -0.11, \"dpitch\": 0.27, \"dyaw\": 3.32, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 1.44, \"dz\": -0.13, \"dpitch\": 0.35, \"dyaw\": 4.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.33, "window_alt_abs_m": 0.12, "target_px_mean_hist": 498.8, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.92, 31.74, 20.13, -46.03, -171.25, 0.0]\n  Target bbox: [623.5, 324.35, 656.27, 394.86]\n\nFrame 2:\n  Drone pose: [101.46, 31.91, 20.15, -48.91, -169.48, 0.0]\n  Target bbox: [637.64, 239.92, 665.87, 311.45]\n\nFrame 3:\n  Drone pose: [100.94, 32.2, 20.13, -42.46, -164.44, 0.0]\n  Target bbox: [584.92, 347.06, 615.36, 419.92]\n\nFrame 4:\n  Drone pose: [100.44, 32.2, 20.12, -43.8, -167.73, 0.0]\n  Target bbox: [629.21, 327.37, 650.55, 391.93]\n\nFrame 5 (current):\n  Drone pose: [100.01, 32.04, 20.18, -40.44, -162.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.97, \"ymin\": 315.06, \"xmax\": 586.13, \"ymax\": 372.53}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.16, \"dz\": -0.09, \"dpitch\": -3.32, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.16, \"dz\": -0.1, \"dpitch\": -3.3, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.16, \"dz\": -0.11, \"dpitch\": -3.29, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 0.16, \"dz\": -0.12, \"dpitch\": -3.27, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": 0.16, \"dz\": -0.13, \"dpitch\": -3.26, \"dyaw\": -4.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.02, "window_alt_abs_m": 0.12, "target_px_mean_hist": 504.0, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.44, 32.2, 20.05, -43.7, -167.73, 0.0]\n  Target bbox: [625.84, 323.64, 654.49, 395.8]\n\nFrame 2:\n  Drone pose: [96.94, 32.2, 20.04, -43.69, -167.73, 0.0]\n  Target bbox: [626.55, 327.03, 653.68, 392.32]\n\nFrame 3:\n  Drone pose: [96.44, 32.2, 20.04, -43.68, -167.73, 0.0]\n  Target bbox: [627.44, 326.67, 652.82, 392.66]\n\nFrame 4:\n  Drone pose: [96.01, 32.29, 20.08, -47.25, -161.18, 0.0]\n  Target bbox: [566.76, 333.67, 600.27, 409.77]\n\nFrame 5 (current):\n  Drone pose: [95.44, 32.2, 20.03, -43.66, -167.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.81, \"ymin\": 324.13, \"xmax\": 654.52, \"ymax\": 395.33}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 1.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 1.5, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": 1.57, \"dz\": -0.02, \"dpitch\": -0.37, \"dyaw\": -1.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.09, "window_alt_abs_m": 0.1, "target_px_mean_hist": 504.0, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.04, 33.81, 20.01, -46.31, -171.73, 0.0]\n  Target bbox: [644.75, 295.23, 676.99, 362.33]\n\nFrame 2:\n  Drone pose: [91.29, 33.74, 20.01, -44.57, -176.58, 0.0]\n  Target bbox: [679.3, 331.66, 720.28, 404.22]\n\nFrame 3:\n  Drone pose: [90.52, 33.65, 20.01, -45.46, -173.25, 0.0]\n  Target bbox: [628.08, 323.21, 652.25, 396.05]\n\nFrame 4:\n  Drone pose: [89.76, 33.2, 19.96, -40.74, -169.9, 0.0]\n  Target bbox: [561.75, 365.24, 604.81, 437.34]\n\nFrame 5 (current):\n  Drone pose: [88.93, 32.96, 20.0, -45.58, -174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 584.7, \"ymin\": 341.57, \"xmax\": 610.68, \"ymax\": 412.94}, \"waypoint_deltas\": [{\"dx\": -0.78, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -1.45, \"dyaw\": -5.66, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -1.19, \"dz\": 0.0, \"dpitch\": -1.79, \"dyaw\": -9.13, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -1.56, \"dz\": 0.0, \"dpitch\": -1.53, \"dyaw\": -12.0, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -1.68, \"dz\": 0.0, \"dpitch\": -1.34, \"dyaw\": -13.99, \"droll\": 0.0}, {\"dx\": -3.01, \"dy\": -1.79, \"dz\": 0.0, \"dpitch\": -1.1, \"dyaw\": -15.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.31, "window_alt_abs_m": 0.1, "target_px_mean_hist": 524.5, "cur_frame_id": 41, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.92, 31.17, 20.0, -43.72, 164.5, 0.0]\n  Target bbox: [677.73, 374.83, 716.46, 447.16]\n\nFrame 2:\n  Drone pose: [85.51, 31.16, 20.0, -48.68, 172.87, 0.0]\n  Target bbox: [566.04, 290.29, 597.25, 355.21]\n\nFrame 3:\n  Drone pose: [84.98, 31.05, 19.91, -49.96, 164.66, 0.0]\n  Target bbox: [625.39, 320.18, 654.19, 398.75]\n\nFrame 4:\n  Drone pose: [84.43, 30.97, 20.0, -47.84, 163.64, 0.0]\n  Target bbox: [630.11, 294.82, 662.42, 364.08]\n\nFrame 5 (current):\n  Drone pose: [83.72, 30.81, 20.0, -43.95, 158.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 662.98, \"ymin\": 360.82, \"xmax\": 696.85, \"ymax\": 430.94}, \"waypoint_deltas\": [{\"dx\": -0.76, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -2.14, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.34, \"dz\": 0.0, \"dpitch\": -2.1, \"dyaw\": -1.07, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -1.92, \"dyaw\": -2.98, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -1.62, \"dyaw\": -4.61, \"droll\": 0.0}, {\"dx\": -3.17, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -1.61, \"dyaw\": -4.66, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.65, "window_alt_abs_m": 0.18, "target_px_mean_hist": 563.0, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [80.55, 30.29, 20.0, -44.62, 158.93, 0.0]\n  Target bbox: [563.99, 343.18, 597.61, 411.21]\n\nFrame 2:\n  Drone pose: [80.0, 30.32, 20.0, -45.67, 155.64, 0.0]\n  Target bbox: [603.44, 325.66, 635.52, 393.01]\n\nFrame 3:\n  Drone pose: [79.33, 30.39, 20.0, -45.93, 153.88, 0.0]\n  Target bbox: [624.0, 325.79, 655.84, 393.41]\n\nFrame 4:\n  Drone pose: [78.52, 30.53, 20.0, -46.46, 153.85, 0.0]\n  Target bbox: [621.9, 322.08, 657.81, 397.16]\n\nFrame 5 (current):\n  Drone pose: [77.66, 30.76, 20.0, -42.16, 153.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.67, \"ymin\": 404.77, \"xmax\": 665.11, \"ymax\": 482.73}, \"waypoint_deltas\": [{\"dx\": -0.83, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": -5.71, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": -6.3, \"dyaw\": 2.03, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": -6.65, \"dyaw\": 3.29, \"droll\": 0.0}, {\"dx\": -2.47, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": -6.74, \"dyaw\": 4.62, \"droll\": 0.0}, {\"dx\": -2.77, \"dy\": 1.78, \"dz\": 0.0, \"dpitch\": -6.6, \"dyaw\": 5.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.47, "window_alt_abs_m": 0.0, "target_px_mean_hist": 536.5, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.61, 32.56, 20.0, -45.9, 159.21, 0.0]\n  Target bbox: [623.64, 361.44, 661.71, 442.07]\n\nFrame 2:\n  Drone pose: [74.3, 32.53, 20.0, -49.08, 154.61, 0.0]\n  Target bbox: [679.09, 310.01, 713.68, 379.29]\n\nFrame 3:\n  Drone pose: [74.01, 32.48, 20.0, -47.74, 159.7, 0.0]\n  Target bbox: [624.24, 326.06, 655.61, 392.94]\n\nFrame 4:\n  Drone pose: [73.62, 32.32, 20.0, -47.48, 159.3, 0.0]\n  Target bbox: [622.11, 320.53, 657.53, 398.64]\n\nFrame 5 (current):\n  Drone pose: [73.17, 32.13, 20.0, -51.4, 161.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 587.64, \"ymin\": 251.1, \"xmax\": 625.86, \"ymax\": 331.18}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 4.23, \"dyaw\": -3.41, \"droll\": 0.0}, {\"dx\": -1.23, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": 3.9, \"dyaw\": -3.8, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": 3.58, \"dyaw\": -4.2, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 3.25, \"dyaw\": -4.61, \"droll\": 0.0}, {\"dx\": -3.41, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": 2.92, \"dyaw\": -5.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 559.5, "cur_frame_id": 69, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [69.76, 31.8, 20.0, -48.48, 156.63, 0.0]\n  Target bbox: [620.01, 321.64, 660.28, 397.46]\n\nFrame 2:\n  Drone pose: [69.03, 31.77, 20.0, -49.19, 149.64, 0.0]\n  Target bbox: [676.29, 312.74, 716.24, 385.35]\n\nFrame 3:\n  Drone pose: [68.3, 31.73, 20.0, -48.33, 150.06, 0.0]\n  Target bbox: [650.5, 325.22, 688.36, 397.05]\n\nFrame 4:\n  Drone pose: [67.57, 31.7, 20.0, -53.31, 147.66, 0.0]\n  Target bbox: [656.86, 242.7, 691.79, 309.38]\n\nFrame 5 (current):\n  Drone pose: [66.92, 31.75, 20.04, -45.81, 147.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.98, \"ymin\": 325.48, \"xmax\": 655.27, \"ymax\": 393.69}, \"waypoint_deltas\": [{\"dx\": -0.8, \"dy\": -0.13, \"dz\": -0.04, \"dpitch\": -2.22, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -0.16, \"dz\": -0.04, \"dpitch\": -2.03, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": -2.19, \"dy\": -0.14, \"dz\": -0.04, \"dpitch\": -2.27, \"dyaw\": -3.36, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": -0.1, \"dz\": -0.04, \"dpitch\": -2.0, \"dyaw\": -4.88, \"droll\": 0.0}, {\"dx\": -3.38, \"dy\": 0.01, \"dz\": -0.04, \"dpitch\": -1.69, \"dyaw\": -6.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.15, "window_alt_abs_m": 0.04, "target_px_mean_hist": 559.0, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [63.54, 31.76, 20.0, -47.5, 141.93, 0.0]\n  Target bbox: [623.29, 324.06, 657.03, 394.95]\n\nFrame 2:\n  Drone pose: [62.97, 31.86, 20.0, -47.17, 140.79, 0.0]\n  Target bbox: [621.01, 323.23, 658.83, 395.93]\n\nFrame 3:\n  Drone pose: [62.25, 32.06, 19.98, -41.41, 131.37, 0.0]\n  Target bbox: [663.52, 408.0, 696.18, 481.14]\n\nFrame 4:\n  Drone pose: [61.82, 32.1, 19.98, -42.67, 132.05, 0.0]\n  Target bbox: [675.81, 303.17, 712.99, 367.25]\n\nFrame 5 (current):\n  Drone pose: [61.29, 32.18, 20.0, -44.52, 141.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 577.13, \"ymin\": 353.64, \"xmax\": 620.22, \"ymax\": 430.22}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -2.05, \"dyaw\": -3.4, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": -1.7, \"dyaw\": -4.45, \"droll\": 0.0}, {\"dx\": -1.69, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": -1.34, \"dyaw\": -5.47, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": 0.42, \"dz\": 0.0, \"dpitch\": -1.52, \"dyaw\": -7.58, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -1.14, \"dyaw\": -8.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.23, "window_alt_abs_m": 0.05, "target_px_mean_hist": 458.8, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776223869", "difficulty_score": 0.3018, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-88.16, -55.94, 22.0, -46.4, 116.57, 0.0]\n  Target bbox: [618.89, 324.62, 661.22, 394.68]\n\nFrame 2:\n  Drone pose: [-89.38, -56.47, 21.2, -44.67, 112.23, 0.0]\n  Target bbox: [615.74, 324.19, 664.44, 395.3]\n\nFrame 3:\n  Drone pose: [-90.07, -56.47, 20.67, -43.61, 109.92, 0.0]\n  Target bbox: [625.41, 325.41, 654.82, 393.91]\n\nFrame 4:\n  Drone pose: [-90.52, -56.18, 20.64, -43.5, 108.55, 0.0]\n  Target bbox: [616.73, 321.8, 663.03, 397.76]\n\nFrame 5 (current):\n  Drone pose: [-90.85, -55.75, 20.62, -43.3, 108.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.93, \"ymin\": 321.87, \"xmax\": 662.86, \"ymax\": 397.62}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": 0.48, \"dz\": -0.03, \"dpitch\": 0.17, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": -0.53, \"dy\": 0.96, \"dz\": -0.05, \"dpitch\": 0.33, \"dyaw\": 1.17, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": 1.44, \"dz\": -0.07, \"dpitch\": 0.28, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 1.91, \"dz\": -0.09, \"dpitch\": 0.23, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": 2.38, \"dz\": -0.2, \"dpitch\": 0.3, \"dyaw\": -1.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.4, "window_alt_abs_m": 1.38, "target_px_mean_hist": 347.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-92.19, -53.37, 20.42, -43.0, 107.94, 0.0]\n  Target bbox: [621.5, 324.22, 658.77, 395.16]\n\nFrame 2:\n  Drone pose: [-92.55, -52.9, 20.39, -43.06, 106.98, 0.0]\n  Target bbox: [618.79, 323.34, 661.0, 396.23]\n\nFrame 3:\n  Drone pose: [-92.97, -52.42, 20.36, -42.96, 107.17, 0.0]\n  Target bbox: [621.74, 324.69, 658.05, 394.81]\n\nFrame 4:\n  Drone pose: [-93.44, -51.94, 20.33, -42.88, 107.24, 0.0]\n  Target bbox: [616.81, 321.52, 663.01, 397.98]\n\nFrame 5 (current):\n  Drone pose: [-93.95, -51.45, 20.3, -42.83, 107.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.26, \"ymin\": 321.3, \"xmax\": 663.5, \"ymax\": 398.32}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.5, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 0.99, \"dz\": -0.06, \"dpitch\": 0.09, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 1.47, \"dz\": -0.08, \"dpitch\": 0.17, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": 1.94, \"dz\": -0.11, \"dpitch\": 0.28, \"dyaw\": 0.23, \"droll\": 0.0}, {\"dx\": -2.25, \"dy\": 2.41, \"dz\": -0.13, \"dpitch\": 0.21, \"dyaw\": -0.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.25, "window_alt_abs_m": 0.12, "target_px_mean_hist": 535.8, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-96.52, -48.59, 20.15, -42.65, 105.59, 0.0]\n  Target bbox: [621.21, 324.66, 659.04, 394.66]\n\nFrame 2:\n  Drone pose: [-96.79, -48.14, 20.13, -42.66, 104.82, 0.0]\n  Target bbox: [614.96, 323.77, 665.35, 395.69]\n\nFrame 3:\n  Drone pose: [-97.05, -47.69, 20.12, -42.66, 104.12, 0.0]\n  Target bbox: [615.4, 323.37, 664.94, 396.06]\n\nFrame 4:\n  Drone pose: [-97.28, -47.24, 20.1, -42.64, 103.44, 0.0]\n  Target bbox: [620.01, 324.87, 660.26, 394.45]\n\nFrame 5 (current):\n  Drone pose: [-97.52, -46.79, 20.09, -42.63, 102.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.47, \"ymin\": 324.92, \"xmax\": 663.85, \"ymax\": 394.51}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": 0.45, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.66, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 0.89, \"dz\": -0.02, \"dpitch\": 0.05, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": 1.34, \"dz\": -0.03, \"dpitch\": 0.07, \"dyaw\": -1.97, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": 1.79, \"dz\": -0.04, \"dpitch\": 0.1, \"dyaw\": -2.59, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 2.25, \"dz\": -0.05, \"dpitch\": 0.12, \"dyaw\": -3.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.8, "window_alt_abs_m": 0.06, "target_px_mean_hist": 537.2, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.87, -44.08, 20.04, -42.5, 98.98, 0.0]\n  Target bbox: [624.3, 326.5, 655.99, 392.87]\n\nFrame 2:\n  Drone pose: [-99.08, -43.62, 20.03, -42.48, 98.38, 0.0]\n  Target bbox: [620.33, 324.49, 660.03, 394.81]\n\nFrame 3:\n  Drone pose: [-99.28, -43.15, 20.03, -42.47, 97.82, 0.0]\n  Target bbox: [622.84, 326.34, 657.47, 393.02]\n\nFrame 4:\n  Drone pose: [-99.46, -42.7, 20.02, -42.44, 97.3, 0.0]\n  Target bbox: [617.37, 324.58, 663.04, 394.73]\n\nFrame 5 (current):\n  Drone pose: [-99.65, -42.24, 20.02, -42.41, 96.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.94, \"ymin\": 325.0, \"xmax\": 663.48, \"ymax\": 394.32}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": 0.47, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": 0.94, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": -0.96, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.41, \"dz\": -0.01, \"dpitch\": 0.06, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": 1.89, \"dz\": -0.01, \"dpitch\": 0.09, \"dyaw\": -1.74, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": 2.36, \"dz\": -0.01, \"dpitch\": 0.11, \"dyaw\": -2.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.2, "window_alt_abs_m": 0.02, "target_px_mean_hist": 546.0, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00043/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.51, -39.4, 20.01, -42.28, 94.38, 0.0]\n  Target bbox: [628.05, 326.09, 652.2, 393.2]\n\nFrame 2:\n  Drone pose: [-100.61, -38.93, 20.0, -42.26, 94.11, 0.0]\n  Target bbox: [623.74, 325.5, 656.61, 393.79]\n\nFrame 3:\n  Drone pose: [-100.69, -38.45, 20.0, -42.24, 93.89, 0.0]\n  Target bbox: [626.12, 327.71, 654.16, 391.65]\n\nFrame 4:\n  Drone pose: [-100.75, -37.96, 20.0, -42.22, 93.71, 0.0]\n  Target bbox: [621.34, 326.78, 659.02, 392.5]\n\nFrame 5 (current):\n  Drone pose: [-100.8, -37.48, 20.0, -42.2, 93.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.62, \"ymin\": 326.49, \"xmax\": 657.71, \"ymax\": 392.74}, \"waypoint_deltas\": [{\"dx\": -0.03, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 1.46, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 1.95, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 2.44, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.8, "window_alt_abs_m": 0.0, "target_px_mean_hist": 556.2, "cur_frame_id": 43, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.83, -34.54, 20.0, -42.11, 93.49, 0.0]\n  Target bbox: [623.3, 327.05, 657.04, 392.28]\n\nFrame 2:\n  Drone pose: [-100.82, -34.04, 20.0, -42.12, 93.52, 0.0]\n  Target bbox: [622.58, 327.09, 657.76, 392.23]\n\nFrame 3:\n  Drone pose: [-100.82, -33.53, 20.0, -42.13, 93.51, 0.0]\n  Target bbox: [623.96, 327.15, 656.36, 392.19]\n\nFrame 4:\n  Drone pose: [-100.84, -33.0, 20.0, -42.16, 93.47, 0.0]\n  Target bbox: [618.7, 325.27, 661.74, 394.0]\n\nFrame 5 (current):\n  Drone pose: [-100.88, -32.47, 20.0, -42.21, 93.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.79, \"ymin\": 327.66, \"xmax\": 656.53, \"ymax\": 391.68}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 2.16, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 1.95, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": 2.69, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": 1.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.18, "window_alt_abs_m": 0.0, "target_px_mean_hist": 548.8, "cur_frame_id": 53, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.27, -29.78, 20.0, -42.41, 95.08, 0.0]\n  Target bbox: [621.23, 324.33, 658.59, 395.19]\n\nFrame 2:\n  Drone pose: [-101.35, -29.24, 20.0, -42.4, 96.23, 0.0]\n  Target bbox: [620.21, 326.25, 660.13, 393.02]\n\nFrame 3:\n  Drone pose: [-101.42, -28.71, 20.0, -42.46, 96.03, 0.0]\n  Target bbox: [621.29, 324.26, 658.52, 395.26]\n\nFrame 4:\n  Drone pose: [-101.5, -28.18, 20.0, -42.43, 97.21, 0.0]\n  Target bbox: [616.24, 325.33, 664.18, 394.03]\n\nFrame 5 (current):\n  Drone pose: [-101.57, -27.65, 20.0, -42.49, 97.02, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.51, \"ymin\": 325.8, \"xmax\": 660.84, \"ymax\": 393.49}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.59, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.68, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": 2.12, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.78, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 2.65, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 1.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 537.2, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.17, -24.48, 20.0, -42.54, 99.56, 0.0]\n  Target bbox: [626.65, 325.24, 653.61, 394.04]\n\nFrame 2:\n  Drone pose: [-102.29, -23.95, 20.0, -42.6, 99.24, 0.0]\n  Target bbox: [618.52, 322.45, 661.49, 396.98]\n\nFrame 3:\n  Drone pose: [-102.41, -23.43, 20.0, -42.55, 100.29, 0.0]\n  Target bbox: [622.49, 324.47, 657.83, 394.87]\n\nFrame 4:\n  Drone pose: [-102.53, -22.91, 20.0, -42.6, 99.97, 0.0]\n  Target bbox: [620.82, 323.14, 659.04, 396.27]\n\nFrame 5 (current):\n  Drone pose: [-102.65, -22.39, 20.0, -42.54, 101.02, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.17, \"ymin\": 325.42, \"xmax\": 660.14, \"ymax\": 393.94}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 1.04, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 1.57, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": 2.09, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": 2.61, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.73, "window_alt_abs_m": 0.0, "target_px_mean_hist": 547.5, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.48, -19.27, 20.0, -42.66, 101.53, 0.0]\n  Target bbox: [616.48, 321.43, 663.35, 398.23]\n\nFrame 2:\n  Drone pose: [-103.65, -18.75, 20.0, -42.59, 102.46, 0.0]\n  Target bbox: [624.8, 326.37, 655.45, 393.01]\n\nFrame 3:\n  Drone pose: [-103.81, -18.23, 20.0, -42.66, 102.03, 0.0]\n  Target bbox: [617.73, 321.92, 662.23, 397.51]\n\nFrame 4:\n  Drone pose: [-103.97, -17.71, 20.0, -42.58, 102.95, 0.0]\n  Target bbox: [618.52, 324.71, 661.77, 394.63]\n\nFrame 5 (current):\n  Drone pose: [-104.13, -17.19, 20.0, -42.65, 102.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.03, \"ymin\": 320.48, \"xmax\": 663.85, \"ymax\": 399.06}, \"waypoint_deltas\": [{\"dx\": -0.15, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 1.54, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": 2.06, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.99, \"droll\": 0.0}, {\"dx\": -0.83, \"dy\": 2.57, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.7, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.0, "cur_frame_id": 82, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.15, -14.11, 20.0, -42.62, 103.89, 0.0]\n  Target bbox: [626.34, 325.13, 653.91, 394.17]\n\nFrame 2:\n  Drone pose: [-105.35, -13.59, 20.0, -42.7, 103.37, 0.0]\n  Target bbox: [617.53, 321.21, 662.25, 398.38]\n\nFrame 3:\n  Drone pose: [-105.54, -13.08, 20.0, -42.62, 104.19, 0.0]\n  Target bbox: [624.18, 326.22, 656.07, 393.17]\n\nFrame 4:\n  Drone pose: [-105.74, -12.57, 20.0, -42.7, 103.67, 0.0]\n  Target bbox: [621.34, 323.61, 658.53, 395.75]\n\nFrame 5 (current):\n  Drone pose: [-105.94, -12.06, 20.0, -42.61, 104.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.67, \"ymin\": 325.49, \"xmax\": 658.61, \"ymax\": 393.91}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.55, \"dy\": 1.53, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -0.73, \"dy\": 2.04, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": 2.55, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.7, "window_alt_abs_m": 0.0, "target_px_mean_hist": 547.8, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-88.25, -56.08, 22.05, -48.12, 109.1, 0.0]\n  Target bbox: [613.58, 320.44, 666.43, 398.99]\n\nFrame 2:\n  Drone pose: [-89.38, -56.47, 21.2, -48.87, 113.97, 0.0]\n  Target bbox: [602.08, 255.94, 635.65, 322.9]\n\nFrame 3:\n  Drone pose: [-90.07, -56.47, 20.67, -43.61, 109.92, 0.0]\n  Target bbox: [616.86, 324.53, 663.36, 394.96]\n\nFrame 4:\n  Drone pose: [-90.52, -56.18, 20.64, -43.5, 108.55, 0.0]\n  Target bbox: [622.11, 324.64, 657.73, 394.7]\n\nFrame 5 (current):\n  Drone pose: [-90.85, -55.75, 20.62, -42.17, 110.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 601.2, \"ymin\": 342.44, \"xmax\": 637.75, \"ymax\": 415.36}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": 0.48, \"dz\": -0.03, \"dpitch\": -0.96, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": -0.53, \"dy\": 0.96, \"dz\": -0.05, \"dpitch\": -0.8, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": 1.44, \"dz\": -0.07, \"dpitch\": -0.85, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 1.91, \"dz\": -0.09, \"dpitch\": -0.9, \"dyaw\": -1.83, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": 2.38, \"dz\": -0.2, \"dpitch\": -0.83, \"dyaw\": -2.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.32, "window_alt_abs_m": 1.43, "target_px_mean_hist": 389.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00007/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00011/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-91.63, -54.31, 20.55, -43.02, 109.46, 0.0]\n  Target bbox: [624.76, 326.15, 654.97, 393.13]\n\nFrame 2:\n  Drone pose: [-91.78, -53.86, 20.63, -44.15, 116.06, 0.0]\n  Target bbox: [619.72, 323.75, 659.99, 395.72]\n\nFrame 3:\n  Drone pose: [-92.19, -53.37, 20.42, -43.0, 107.94, 0.0]\n  Target bbox: [621.82, 324.3, 658.44, 395.08]\n\nFrame 4:\n  Drone pose: [-92.55, -52.9, 20.39, -47.15, 106.61, 0.0]\n  Target bbox: [625.06, 255.33, 664.01, 326.85]\n\nFrame 5 (current):\n  Drone pose: [-92.97, -52.42, 20.36, -42.96, 107.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.54, \"ymin\": 321.34, \"xmax\": 663.27, \"ymax\": 398.17}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": 0.48, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.97, \"dz\": -0.06, \"dpitch\": 0.13, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 1.47, \"dz\": -0.09, \"dpitch\": 0.17, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 1.96, \"dz\": -0.12, \"dpitch\": 0.22, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": 2.44, \"dz\": -0.14, \"dpitch\": 0.3, \"dyaw\": 0.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.6, "window_alt_abs_m": 0.35, "target_px_mean_hist": 538.0, "cur_frame_id": 11, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.46, -50.95, 20.27, -41.61, 109.76, 0.0]\n  Target bbox: [588.68, 345.36, 626.45, 414.58]\n\nFrame 2:\n  Drone pose: [-94.96, -50.46, 20.24, -42.74, 107.15, 0.0]\n  Target bbox: [621.44, 323.74, 658.38, 395.65]\n\nFrame 3:\n  Drone pose: [-95.43, -49.98, 20.22, -45.9, 106.77, 0.0]\n  Target bbox: [622.98, 267.7, 668.4, 343.21]\n\nFrame 4:\n  Drone pose: [-95.84, -49.51, 20.19, -42.18, 102.44, 0.0]\n  Target bbox: [686.05, 333.59, 718.47, 401.98]\n\nFrame 5 (current):\n  Drone pose: [-96.2, -49.04, 20.17, -42.62, 106.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.24, \"ymin\": 325.31, \"xmax\": 659.02, \"ymax\": 394.12}, \"waypoint_deltas\": [{\"dx\": -0.32, \"dy\": 0.45, \"dz\": -0.02, \"dpitch\": -0.03, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": 0.9, \"dz\": -0.04, \"dpitch\": -0.04, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": -0.85, \"dy\": 1.35, \"dz\": -0.05, \"dpitch\": -0.04, \"dyaw\": -2.34, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": 1.8, \"dz\": -0.07, \"dpitch\": -0.02, \"dyaw\": -3.02, \"droll\": 0.0}, {\"dx\": -1.32, \"dy\": 2.25, \"dz\": -0.08, \"dpitch\": -0.01, \"dyaw\": -3.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.35, "window_alt_abs_m": 0.1, "target_px_mean_hist": 530.8, "cur_frame_id": 18, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.05, -47.69, 20.12, -39.24, 105.99, 0.0]\n  Target bbox: [594.4, 381.83, 638.58, 452.79]\n\nFrame 2:\n  Drone pose: [-97.28, -47.24, 20.1, -40.6, 98.44, 0.0]\n  Target bbox: [686.66, 361.45, 717.75, 430.0]\n\nFrame 3:\n  Drone pose: [-97.52, -46.79, 20.09, -45.38, 98.07, 0.0]\n  Target bbox: [675.59, 277.16, 721.86, 353.15]\n\nFrame 4:\n  Drone pose: [-97.78, -46.27, 20.24, -44.5, 99.05, 0.0]\n  Target bbox: [642.85, 296.5, 672.18, 362.04]\n\nFrame 5 (current):\n  Drone pose: [-97.83, -46.0, 20.1, -39.21, 99.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.47, \"ymin\": 343.29, \"xmax\": 660.37, \"ymax\": 412.88}, \"waypoint_deltas\": [{\"dx\": -0.38, \"dy\": 0.55, \"dz\": -0.04, \"dpitch\": -3.35, \"dyaw\": 1.32, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": 1.0, \"dz\": -0.05, \"dpitch\": -3.32, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 1.46, \"dz\": -0.06, \"dpitch\": -3.3, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 1.92, \"dz\": -0.06, \"dpitch\": -3.29, \"dyaw\": -0.52, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 2.38, \"dz\": -0.07, \"dpitch\": -3.27, \"dyaw\": -1.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.35, "window_alt_abs_m": 0.33, "target_px_mean_hist": 565.5, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.87, -44.08, 20.04, -45.85, 93.98, 0.0]\n  Target bbox: [680.61, 272.52, 724.62, 337.91]\n\nFrame 2:\n  Drone pose: [-99.08, -43.62, 20.03, -41.77, 102.21, 0.0]\n  Target bbox: [573.57, 339.23, 611.24, 406.21]\n\nFrame 3:\n  Drone pose: [-99.28, -43.15, 20.03, -42.47, 97.82, 0.0]\n  Target bbox: [619.78, 325.98, 660.55, 393.3]\n\nFrame 4:\n  Drone pose: [-99.46, -42.7, 20.02, -41.15, 100.67, 0.0]\n  Target bbox: [573.46, 346.58, 622.47, 417.68]\n\nFrame 5 (current):\n  Drone pose: [-99.65, -42.24, 20.02, -40.92, 91.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 681.21, \"ymin\": 352.28, \"xmax\": 724.07, \"ymax\": 420.58}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": 0.47, \"dz\": 0.0, \"dpitch\": -1.47, \"dyaw\": 4.5, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": 0.94, \"dz\": -0.01, \"dpitch\": -1.45, \"dyaw\": 4.04, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.41, \"dz\": -0.01, \"dpitch\": -1.43, \"dyaw\": 3.64, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": 1.89, \"dz\": -0.01, \"dpitch\": -1.4, \"dyaw\": 3.26, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": 2.36, \"dz\": -0.01, \"dpitch\": -1.38, \"dyaw\": 2.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.37, "window_alt_abs_m": 0.02, "target_px_mean_hist": 552.2, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.14, -40.83, 20.01, -46.14, 96.23, 0.0]\n  Target bbox: [609.78, 262.49, 649.89, 329.43]\n\nFrame 2:\n  Drone pose: [-100.27, -40.35, 20.01, -42.32, 95.04, 0.0]\n  Target bbox: [619.24, 326.89, 661.14, 392.46]\n\nFrame 3:\n  Drone pose: [-100.4, -39.88, 20.01, -42.3, 94.69, 0.0]\n  Target bbox: [617.19, 326.41, 663.25, 392.94]\n\nFrame 4:\n  Drone pose: [-100.51, -39.4, 20.01, -41.92, 91.19, 0.0]\n  Target bbox: [666.29, 334.5, 693.63, 398.25]\n\nFrame 5 (current):\n  Drone pose: [-100.61, -38.93, 20.0, -40.37, 96.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 592.22, \"ymin\": 357.09, \"xmax\": 626.96, \"ymax\": 426.36}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": -1.87, \"dyaw\": -2.66, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -1.85, \"dyaw\": -2.84, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": -1.83, \"dyaw\": -2.97, \"droll\": 0.0}, {\"dx\": -0.22, \"dy\": 1.94, \"dz\": 0.0, \"dpitch\": -1.81, \"dyaw\": -3.06, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 2.42, \"dz\": 0.0, \"dpitch\": -1.79, \"dyaw\": -3.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.39, "window_alt_abs_m": 0.01, "target_px_mean_hist": 552.5, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.78, -37.43, 20.16, -44.53, 95.66, 0.0]\n  Target bbox: [585.16, 248.23, 627.76, 319.57]\n\nFrame 2:\n  Drone pose: [-100.86, -37.0, 19.81, -39.76, 96.19, 0.0]\n  Target bbox: [642.97, 364.84, 685.22, 437.24]\n\nFrame 3:\n  Drone pose: [-100.85, -36.51, 20.0, -46.91, 92.51, 0.0]\n  Target bbox: [628.98, 246.4, 675.08, 313.43]\n\nFrame 4:\n  Drone pose: [-100.86, -36.02, 20.0, -39.75, 98.42, 0.0]\n  Target bbox: [555.32, 365.95, 599.67, 437.44]\n\nFrame 5 (current):\n  Drone pose: [-100.85, -35.53, 20.0, -42.13, 93.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.55, \"ymin\": 325.33, \"xmax\": 658.85, \"ymax\": 393.96}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": 0.99, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.11, "window_alt_abs_m": 0.55, "target_px_mean_hist": 546.5, "cur_frame_id": 47, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.81, -33.37, 20.05, -45.36, 88.18, 0.0]\n  Target bbox: [675.71, 315.53, 724.17, 387.21]\n\nFrame 2:\n  Drone pose: [-100.84, -33.0, 20.0, -42.16, 93.47, 0.0]\n  Target bbox: [624.5, 327.73, 655.8, 391.6]\n\nFrame 3:\n  Drone pose: [-100.88, -32.47, 20.0, -40.68, 94.8, 0.0]\n  Target bbox: [609.69, 351.74, 634.46, 419.28]\n\nFrame 4:\n  Drone pose: [-100.94, -31.94, 20.0, -42.26, 93.21, 0.0]\n  Target bbox: [616.95, 320.7, 663.09, 398.86]\n\nFrame 5 (current):\n  Drone pose: [-101.01, -31.39, 20.0, -42.28, 94.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.91, \"ymin\": 321.92, \"xmax\": 662.99, \"ymax\": 397.57}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.61, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": 2.15, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": 2.68, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 1.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.38, "window_alt_abs_m": 0.05, "target_px_mean_hist": 552.5, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.27, -29.78, 20.0, -42.41, 95.08, 0.0]\n  Target bbox: [618.8, 325.18, 661.35, 394.16]\n\nFrame 2:\n  Drone pose: [-101.26, -29.21, 20.03, -43.67, 97.89, 0.0]\n  Target bbox: [633.28, 305.17, 670.1, 376.9]\n\nFrame 3:\n  Drone pose: [-101.42, -28.71, 20.0, -42.46, 96.03, 0.0]\n  Target bbox: [618.42, 322.12, 661.62, 397.35]\n\nFrame 4:\n  Drone pose: [-101.5, -28.18, 20.0, -39.68, 102.21, 0.0]\n  Target bbox: [555.57, 372.9, 599.89, 442.7]\n\nFrame 5 (current):\n  Drone pose: [-101.57, -27.65, 20.0, -45.3, 101.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 564.52, \"ymin\": 280.82, \"xmax\": 597.92, \"ymax\": 347.49}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 2.76, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": 2.79, \"dyaw\": -3.79, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.59, \"dz\": 0.0, \"dpitch\": 2.73, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": 2.12, \"dz\": 0.0, \"dpitch\": 2.77, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 2.65, \"dz\": 0.0, \"dpitch\": 2.71, \"dyaw\": -3.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.31, "window_alt_abs_m": 0.07, "target_px_mean_hist": 542.2, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501/aug_001/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.83, -26.06, 20.0, -47.57, 97.57, 0.0]\n  Target bbox: [620.02, 237.13, 663.21, 314.19]\n\nFrame 2:\n  Drone pose: [-101.94, -25.53, 20.0, -43.86, 99.79, 0.0]\n  Target bbox: [607.43, 301.67, 647.81, 373.13]\n\nFrame 3:\n  Drone pose: [-102.05, -25.0, 20.0, -42.59, 98.5, 0.0]\n  Target bbox: [619.39, 322.21, 660.39, 397.34]\n\nFrame 4:\n  Drone pose: [-102.17, -24.48, 20.0, -42.27, 104.56, 0.0]\n  Target bbox: [555.35, 331.07, 600.37, 401.06]\n\nFrame 5 (current):\n  Drone pose: [-102.29, -23.95, 20.0, -41.14, 97.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 640.4, \"ymin\": 350.78, \"xmax\": 684.34, \"ymax\": 418.03}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -1.41, \"dyaw\": 2.84, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 1.04, \"dz\": 0.0, \"dpitch\": -1.46, \"dyaw\": 2.52, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": 1.56, \"dz\": 0.0, \"dpitch\": -1.4, \"dyaw\": 3.57, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 2.08, \"dz\": 0.0, \"dpitch\": -1.46, \"dyaw\": 3.26, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": 2.6, \"dz\": 0.0, \"dpitch\": -1.52, \"dyaw\": 2.92, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 550.0, "cur_frame_id": 69, "source": "aug_001", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776078501", "difficulty_score": 0.2595, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-128.65, 10.68, 22.0, -55.25, -71.7, 0.0]\n  Target bbox: [612.7, 320.82, 667.58, 397.9]\n\nFrame 2:\n  Drone pose: [-129.48, 10.14, 21.2, -54.24, -66.0, 0.0]\n  Target bbox: [613.56, 318.45, 666.68, 400.22]\n\nFrame 3:\n  Drone pose: [-129.51, 9.96, 20.67, -53.4, -63.82, 0.0]\n  Target bbox: [612.13, 317.17, 668.08, 401.55]\n\nFrame 4:\n  Drone pose: [-129.49, 9.77, 20.64, -53.26, -61.76, 0.0]\n  Target bbox: [615.37, 318.76, 664.26, 399.87]\n\nFrame 5 (current):\n  Drone pose: [-129.42, 9.6, 20.62, -52.3, -60.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.44, \"ymin\": 318.45, \"xmax\": 664.15, \"ymax\": 400.3}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": -0.16, \"dz\": -0.03, \"dpitch\": 0.75, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": -0.38, \"dz\": -0.05, \"dpitch\": 1.37, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.64, \"dz\": -0.07, \"dpitch\": 1.89, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.36, \"dy\": -0.96, \"dz\": -0.09, \"dpitch\": 2.32, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 1.72, \"dy\": -1.32, \"dz\": -0.2, \"dpitch\": 2.81, \"dyaw\": 0.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.8, "window_alt_abs_m": 1.38, "target_px_mean_hist": 774.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-127.34, 7.89, 20.39, -49.16, -60.34, 0.0]\n  Target bbox: [616.93, 319.89, 662.7, 399.03]\n\nFrame 2:\n  Drone pose: [-127.0, 7.46, 20.36, -48.89, -59.98, 0.0]\n  Target bbox: [618.07, 319.45, 661.51, 399.55]\n\nFrame 3:\n  Drone pose: [-126.73, 6.95, 20.33, -48.66, -59.28, 0.0]\n  Target bbox: [615.58, 318.23, 663.91, 400.89]\n\nFrame 4:\n  Drone pose: [-126.46, 6.41, 20.3, -48.69, -58.52, 0.0]\n  Target bbox: [619.41, 321.55, 660.24, 397.41]\n\nFrame 5 (current):\n  Drone pose: [-125.67, 6.18, 20.27, -48.52, -59.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.82, \"ymin\": 321.97, \"xmax\": 658.89, \"ymax\": 396.92}, \"waypoint_deltas\": [{\"dx\": 1.0, \"dy\": -0.08, \"dz\": -0.03, \"dpitch\": 0.25, \"dyaw\": -2.12, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.16, \"dz\": -0.05, \"dpitch\": 0.53, \"dyaw\": -4.21, \"droll\": 0.0}, {\"dx\": 3.19, \"dy\": -0.13, \"dz\": -0.08, \"dpitch\": 0.89, \"dyaw\": -7.03, \"droll\": 0.0}, {\"dx\": 4.39, \"dy\": 0.01, \"dz\": -0.1, \"dpitch\": 1.46, \"dyaw\": -9.89, \"droll\": 0.0}, {\"dx\": 5.49, \"dy\": 0.09, \"dz\": -0.12, \"dpitch\": 1.81, \"dyaw\": -13.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.16, "window_alt_abs_m": 0.12, "target_px_mean_hist": 519.5, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-119.23, 6.26, 20.14, -46.31, -76.98, 0.0]\n  Target bbox: [618.81, 321.5, 661.58, 397.66]\n\nFrame 2:\n  Drone pose: [-118.9, 5.94, 20.12, -46.12, -78.09, 0.0]\n  Target bbox: [618.47, 321.31, 661.94, 397.84]\n\nFrame 3:\n  Drone pose: [-118.74, 5.51, 20.11, -46.03, -78.63, 0.0]\n  Target bbox: [621.64, 322.28, 658.73, 396.86]\n\nFrame 4:\n  Drone pose: [-118.62, 5.03, 20.09, -46.02, -79.0, 0.0]\n  Target bbox: [615.33, 322.23, 665.08, 396.93]\n\nFrame 5 (current):\n  Drone pose: [-118.47, 4.53, 20.08, -46.05, -79.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.08, \"ymin\": 321.93, \"xmax\": 661.31, \"ymax\": 397.17}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.5, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.99, \"dz\": -0.02, \"dpitch\": 0.04, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -1.49, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 1.37, \"dy\": -1.99, \"dz\": -0.04, \"dpitch\": 0.11, \"dyaw\": 0.39, \"droll\": 0.0}, {\"dx\": 1.86, \"dy\": -2.48, \"dz\": -0.04, \"dpitch\": 0.13, \"dyaw\": 0.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.49, "window_alt_abs_m": 0.06, "target_px_mean_hist": 758.8, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.61, 1.09, 20.03, -45.83, -79.07, 0.0]\n  Target bbox: [617.91, 320.33, 661.96, 398.87]\n\nFrame 2:\n  Drone pose: [-115.1, 0.62, 20.02, -45.79, -79.12, 0.0]\n  Target bbox: [614.4, 318.46, 665.43, 400.94]\n\nFrame 3:\n  Drone pose: [-114.59, 0.15, 20.02, -45.75, -79.18, 0.0]\n  Target bbox: [619.0, 320.93, 660.87, 398.23]\n\nFrame 4:\n  Drone pose: [-114.06, -0.34, 20.02, -45.73, -79.26, 0.0]\n  Target bbox: [617.51, 320.48, 662.36, 398.79]\n\nFrame 5 (current):\n  Drone pose: [-113.54, -0.83, 20.01, -45.72, -79.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.6, \"ymin\": 317.17, \"xmax\": 667.21, \"ymax\": 402.32}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -2.09, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 2.45, \"dy\": -2.64, \"dz\": -0.01, \"dpitch\": -0.18, \"dyaw\": 0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.26, "window_alt_abs_m": 0.01, "target_px_mean_hist": 731.5, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.64, -4.03, 20.0, -45.97, -78.91, 0.0]\n  Target bbox: [612.77, 317.35, 667.01, 402.15]\n\nFrame 2:\n  Drone pose: [-110.2, -4.59, 20.0, -46.05, -78.71, 0.0]\n  Target bbox: [614.34, 317.59, 665.42, 401.79]\n\nFrame 3:\n  Drone pose: [-109.75, -5.13, 20.0, -46.1, -78.52, 0.0]\n  Target bbox: [615.25, 318.74, 664.61, 400.54]\n\nFrame 4:\n  Drone pose: [-109.32, -5.65, 20.0, -46.11, -78.3, 0.0]\n  Target bbox: [614.99, 318.13, 664.78, 401.21]\n\nFrame 5 (current):\n  Drone pose: [-108.93, -6.16, 20.0, -46.09, -77.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.73, \"ymin\": 319.46, \"xmax\": 663.13, \"ymax\": 399.75}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -1.57, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": 1.28, \"dy\": -2.08, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -0.81, \"droll\": 0.0}, {\"dx\": 1.66, \"dy\": -2.58, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -0.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.95, "window_alt_abs_m": 0.0, "target_px_mean_hist": 750.5, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.58, -9.7, 20.0, -46.09, -77.46, 0.0]\n  Target bbox: [614.6, 318.48, 665.22, 400.85]\n\nFrame 2:\n  Drone pose: [-106.34, -10.2, 20.0, -46.0, -76.68, 0.0]\n  Target bbox: [622.6, 321.76, 657.74, 397.33]\n\nFrame 3:\n  Drone pose: [-106.14, -10.71, 20.0, -46.09, -77.28, 0.0]\n  Target bbox: [621.47, 321.86, 658.89, 397.28]\n\nFrame 4:\n  Drone pose: [-105.92, -11.23, 20.0, -46.19, -77.92, 0.0]\n  Target bbox: [620.57, 321.51, 659.81, 397.6]\n\nFrame 5 (current):\n  Drone pose: [-105.66, -11.75, 20.0, -46.3, -78.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.35, \"ymin\": 319.36, \"xmax\": 663.49, \"ymax\": 399.9}, \"waypoint_deltas\": [{\"dx\": 0.35, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": 1.28, \"dy\": -1.47, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -1.94, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -2.43, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 0.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 766.8, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.42, -14.88, 20.0, -46.24, -76.31, 0.0]\n  Target bbox: [615.7, 318.97, 664.09, 400.32]\n\nFrame 2:\n  Drone pose: [-103.45, -15.6, 20.0, -46.37, -74.51, 0.0]\n  Target bbox: [612.98, 317.4, 666.72, 402.05]\n\nFrame 3:\n  Drone pose: [-103.5, -16.34, 20.0, -46.48, -72.6, 0.0]\n  Target bbox: [614.39, 318.44, 665.34, 400.89]\n\nFrame 4:\n  Drone pose: [-103.68, -17.13, 20.0, -46.59, -70.25, 0.0]\n  Target bbox: [619.47, 321.0, 660.31, 398.07]\n\nFrame 5 (current):\n  Drone pose: [-104.21, -18.12, 20.0, -46.7, -66.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.08, \"ymin\": 318.05, \"xmax\": 665.5, \"ymax\": 401.31}, \"waypoint_deltas\": [{\"dx\": -0.63, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 4.01, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": -2.13, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 8.13, \"droll\": 0.0}, {\"dx\": -1.85, \"dy\": -3.18, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": 11.94, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -4.05, \"dz\": 0.0, \"dpitch\": 0.71, \"dyaw\": 14.43, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": -4.89, \"dz\": 0.0, \"dpitch\": 0.93, \"dyaw\": 16.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.68, "window_alt_abs_m": 0.0, "target_px_mean_hist": 750.8, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.44, -23.78, 20.0, -45.56, -48.16, 0.0]\n  Target bbox: [619.29, 321.53, 660.15, 397.71]\n\nFrame 2:\n  Drone pose: [-106.31, -24.48, 20.0, -45.4, -46.89, 0.0]\n  Target bbox: [621.8, 323.51, 657.78, 395.53]\n\nFrame 3:\n  Drone pose: [-106.17, -25.17, 20.0, -45.23, -45.7, 0.0]\n  Target bbox: [622.34, 323.59, 657.24, 395.5]\n\nFrame 4:\n  Drone pose: [-105.81, -25.76, 20.0, -45.17, -45.2, 0.0]\n  Target bbox: [620.87, 323.43, 659.01, 395.69]\n\nFrame 5 (current):\n  Drone pose: [-105.43, -26.33, 20.0, -45.12, -44.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.68, \"ymin\": 323.36, \"xmax\": 658.45, \"ymax\": 395.77}, \"waypoint_deltas\": [{\"dx\": 0.4, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 0.8, \"dy\": -1.13, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.72, \"droll\": 0.0}, {\"dx\": 1.2, \"dy\": -1.69, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 1.06, \"droll\": 0.0}, {\"dx\": 1.6, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 1.4, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -2.81, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": 1.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.37, "window_alt_abs_m": 0.0, "target_px_mean_hist": 770.0, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.66, -30.25, 20.0, -44.76, -42.36, 0.0]\n  Target bbox: [621.59, 321.55, 658.93, 397.68]\n\nFrame 2:\n  Drone pose: [-102.27, -30.81, 20.0, -44.69, -41.98, 0.0]\n  Target bbox: [620.44, 321.85, 660.11, 397.44]\n\nFrame 3:\n  Drone pose: [-101.88, -31.37, 20.0, -44.63, -41.62, 0.0]\n  Target bbox: [621.1, 321.6, 659.43, 397.65]\n\nFrame 4:\n  Drone pose: [-101.46, -31.93, 20.0, -44.6, -41.32, 0.0]\n  Target bbox: [624.35, 324.48, 655.98, 394.58]\n\nFrame 5 (current):\n  Drone pose: [-101.0, -32.47, 20.0, -44.59, -41.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.44, \"ymin\": 322.64, \"xmax\": 660.05, \"ymax\": 396.57}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -1.34, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -2.21, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 2.9, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -3.22, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 5.03, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": -4.4, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": 7.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.19, "window_alt_abs_m": 0.0, "target_px_mean_hist": 768.8, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.78, -38.31, 20.0, -43.67, -29.57, 0.0]\n  Target bbox: [620.79, 322.86, 659.54, 396.45]\n\nFrame 2:\n  Drone pose: [-100.92, -39.46, 20.0, -43.32, -27.05, 0.0]\n  Target bbox: [618.83, 320.68, 661.57, 398.79]\n\nFrame 3:\n  Drone pose: [-101.02, -40.59, 20.0, -42.92, -24.69, 0.0]\n  Target bbox: [619.95, 322.45, 660.34, 396.96]\n\nFrame 4:\n  Drone pose: [-101.03, -41.61, 20.0, -42.57, -22.79, 0.0]\n  Target bbox: [619.06, 322.01, 661.2, 397.42]\n\nFrame 5 (current):\n  Drone pose: [-100.72, -42.29, 20.0, -42.42, -22.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.92, \"ymin\": 322.66, \"xmax\": 660.34, \"ymax\": 396.8}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": -0.55, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 1.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 723.0, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-128.65, 10.68, 22.0, -52.1, -66.7, 0.0]\n  Target bbox: [566.79, 374.65, 616.36, 453.23]\n\nFrame 2:\n  Drone pose: [-129.48, 10.14, 21.2, -52.0, -66.32, 0.0]\n  Target bbox: [619.16, 356.46, 667.64, 437.35]\n\nFrame 3:\n  Drone pose: [-129.51, 9.96, 20.67, -53.4, -63.82, 0.0]\n  Target bbox: [610.72, 316.53, 669.48, 402.17]\n\nFrame 4:\n  Drone pose: [-129.49, 9.77, 20.64, -53.26, -61.76, 0.0]\n  Target bbox: [614.57, 316.7, 664.96, 401.99]\n\nFrame 5 (current):\n  Drone pose: [-129.42, 9.6, 20.62, -52.3, -60.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.97, \"ymin\": 319.66, \"xmax\": 662.68, \"ymax\": 399.03}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": -0.16, \"dz\": -0.03, \"dpitch\": 0.75, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": -0.38, \"dz\": -0.05, \"dpitch\": 1.37, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.64, \"dz\": -0.07, \"dpitch\": 1.89, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.36, \"dy\": -0.96, \"dz\": -0.09, \"dpitch\": 2.32, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 1.72, \"dy\": -1.32, \"dz\": -0.2, \"dpitch\": 2.81, \"dyaw\": 0.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.8, "window_alt_abs_m": 1.38, "target_px_mean_hist": 749.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 2}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-127.34, 7.89, 20.39, -49.16, -60.34, 0.0]\n  Target bbox: [616.14, 319.36, 663.45, 399.63]\n\nFrame 2:\n  Drone pose: [-127.0, 7.46, 20.36, -48.89, -59.98, 0.0]\n  Target bbox: [616.54, 319.85, 663.08, 399.1]\n\nFrame 3:\n  Drone pose: [-126.73, 6.95, 20.33, -49.45, -55.12, 0.0]\n  Target bbox: [571.33, 309.15, 614.95, 385.49]\n\nFrame 4:\n  Drone pose: [-126.46, 6.41, 20.3, -45.47, -58.82, 0.0]\n  Target bbox: [617.85, 371.86, 668.67, 455.49]\n\nFrame 5 (current):\n  Drone pose: [-125.77, 6.16, 20.24, -43.74, -58.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 676.7, \"ymin\": 353.42, \"xmax\": 722.05, \"ymax\": 431.3}, \"waypoint_deltas\": [{\"dx\": 1.1, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": -4.53, \"dyaw\": -3.93, \"droll\": 0.0}, {\"dx\": 2.09, \"dy\": -0.14, \"dz\": -0.02, \"dpitch\": -4.25, \"dyaw\": -6.02, \"droll\": 0.0}, {\"dx\": 3.29, \"dy\": -0.11, \"dz\": -0.05, \"dpitch\": -3.89, \"dyaw\": -8.84, \"droll\": 0.0}, {\"dx\": 4.49, \"dy\": 0.03, \"dz\": -0.07, \"dpitch\": -3.32, \"dyaw\": -11.7, \"droll\": 0.0}, {\"dx\": 5.59, \"dy\": 0.11, \"dz\": -0.09, \"dpitch\": -2.97, \"dyaw\": -15.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.68, "window_alt_abs_m": 0.15, "target_px_mean_hist": 532.2, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-119.23, 6.26, 20.14, -43.85, -81.66, 0.0]\n  Target bbox: [670.85, 365.3, 719.3, 439.49]\n\nFrame 2:\n  Drone pose: [-118.9, 5.94, 20.12, -46.47, -74.44, 0.0]\n  Target bbox: [572.15, 316.16, 622.19, 393.43]\n\nFrame 3:\n  Drone pose: [-118.74, 5.51, 20.11, -46.03, -78.63, 0.0]\n  Target bbox: [618.2, 321.9, 662.19, 397.23]\n\nFrame 4:\n  Drone pose: [-118.62, 5.03, 20.09, -43.27, -74.0, 0.0]\n  Target bbox: [558.26, 369.2, 604.49, 446.0]\n\nFrame 5 (current):\n  Drone pose: [-118.38, 4.51, 19.97, -46.83, -81.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.92, \"ymin\": 319.26, \"xmax\": 665.78, \"ymax\": 400.01}, \"waypoint_deltas\": [{\"dx\": 0.13, \"dy\": -0.48, \"dz\": 0.1, \"dpitch\": 0.73, \"dyaw\": 1.17, \"droll\": 0.0}, {\"dx\": 0.44, \"dy\": -0.97, \"dz\": 0.09, \"dpitch\": 0.82, \"dyaw\": 1.76, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": -1.47, \"dz\": 0.08, \"dpitch\": 0.86, \"dyaw\": 2.09, \"droll\": 0.0}, {\"dx\": 1.28, \"dy\": -1.97, \"dz\": 0.07, \"dpitch\": 0.89, \"dyaw\": 2.23, \"droll\": 0.0}, {\"dx\": 1.77, \"dy\": -2.46, \"dz\": 0.07, \"dpitch\": 0.91, \"dyaw\": 2.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.36, "window_alt_abs_m": 0.16, "target_px_mean_hist": 729.0, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.61, 1.09, 20.03, -48.02, -74.07, 0.0]\n  Target bbox: [556.7, 285.64, 604.75, 363.65]\n\nFrame 2:\n  Drone pose: [-114.93, 0.68, 19.99, -41.69, -77.41, 0.0]\n  Target bbox: [620.77, 325.0, 658.89, 394.36]\n\nFrame 3:\n  Drone pose: [-114.59, 0.15, 20.02, -48.4, -75.0, 0.0]\n  Target bbox: [563.44, 274.17, 616.9, 358.84]\n\nFrame 4:\n  Drone pose: [-114.06, -0.34, 20.02, -47.72, -74.66, 0.0]\n  Target bbox: [562.72, 288.51, 608.13, 366.66]\n\nFrame 5 (current):\n  Drone pose: [-113.54, -0.83, 20.01, -45.72, -79.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.41, \"ymin\": 317.31, \"xmax\": 666.36, \"ymax\": 402.15}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -2.09, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 2.45, \"dy\": -2.64, \"dz\": -0.01, \"dpitch\": -0.18, \"dyaw\": 0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.77, "window_alt_abs_m": 0.08, "target_px_mean_hist": 709.0, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.64, -4.03, 20.0, -45.38, -76.75, 0.0]\n  Target bbox: [588.35, 328.67, 639.98, 411.31]\n\nFrame 2:\n  Drone pose: [-110.2, -4.59, 20.0, -46.05, -78.71, 0.0]\n  Target bbox: [613.48, 317.49, 666.34, 401.89]\n\nFrame 3:\n  Drone pose: [-109.75, -5.13, 20.0, -42.15, -83.52, 0.0]\n  Target bbox: [672.61, 386.02, 725.08, 469.64]\n\nFrame 4:\n  Drone pose: [-109.32, -5.65, 20.0, -46.11, -78.3, 0.0]\n  Target bbox: [618.97, 320.8, 660.88, 398.35]\n\nFrame 5 (current):\n  Drone pose: [-108.93, -6.16, 20.0, -48.9, -82.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 672.02, \"ymin\": 271.85, \"xmax\": 725.68, \"ymax\": 356.64}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 2.85, \"dyaw\": 5.52, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": 2.88, \"dyaw\": 6.15, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -1.57, \"dz\": 0.0, \"dpitch\": 2.73, \"dyaw\": 5.24, \"droll\": 0.0}, {\"dx\": 1.28, \"dy\": -2.08, \"dz\": 0.0, \"dpitch\": 2.6, \"dyaw\": 4.19, \"droll\": 0.0}, {\"dx\": 1.66, \"dy\": -2.58, \"dz\": 0.0, \"dpitch\": 2.65, \"dyaw\": 4.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.65, "window_alt_abs_m": 0.0, "target_px_mean_hist": 752.2, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.58, -9.7, 20.0, -46.83, -72.46, 0.0]\n  Target bbox: [554.53, 308.13, 607.56, 390.25]\n\nFrame 2:\n  Drone pose: [-106.34, -10.2, 20.0, -49.1, -77.79, 0.0]\n  Target bbox: [632.65, 269.47, 674.5, 345.82]\n\nFrame 3:\n  Drone pose: [-106.14, -10.71, 20.0, -46.61, -72.83, 0.0]\n  Target bbox: [561.26, 313.17, 614.29, 391.63]\n\nFrame 4:\n  Drone pose: [-105.92, -11.23, 20.0, -46.74, -72.92, 0.0]\n  Target bbox: [554.89, 311.19, 607.43, 393.42]\n\nFrame 5 (current):\n  Drone pose: [-105.58, -11.86, 20.07, -44.6, -82.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.33, \"ymin\": 401.5, \"xmax\": 671.14, \"ymax\": 483.74}, \"waypoint_deltas\": [{\"dx\": 0.27, \"dy\": -0.4, \"dz\": -0.07, \"dpitch\": -1.67, \"dyaw\": 4.12, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -0.89, \"dz\": -0.07, \"dpitch\": -1.64, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": 1.2, \"dy\": -1.36, \"dz\": -0.07, \"dpitch\": -1.6, \"dyaw\": 4.31, \"droll\": 0.0}, {\"dx\": 1.72, \"dy\": -1.83, \"dz\": -0.07, \"dpitch\": -1.55, \"dyaw\": 4.24, \"droll\": 0.0}, {\"dx\": 2.15, \"dy\": -2.32, \"dz\": -0.07, \"dpitch\": -1.51, \"dyaw\": 4.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.84, "window_alt_abs_m": 0.07, "target_px_mean_hist": 726.8, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.51, -14.89, 20.14, -49.77, -80.47, 0.0]\n  Target bbox: [614.35, 317.35, 665.5, 401.65]\n\nFrame 2:\n  Drone pose: [-103.45, -15.6, 20.0, -46.37, -74.51, 0.0]\n  Target bbox: [615.56, 319.68, 664.12, 399.5]\n\nFrame 3:\n  Drone pose: [-103.7, -16.34, 19.99, -54.43, -78.39, 0.0]\n  Target bbox: [648.37, 244.01, 705.01, 331.98]\n\nFrame 4:\n  Drone pose: [-103.68, -17.13, 20.0, -46.59, -70.25, 0.0]\n  Target bbox: [618.19, 320.4, 661.58, 398.71]\n\nFrame 5 (current):\n  Drone pose: [-104.21, -18.12, 20.0, -46.7, -66.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.69, \"ymin\": 319.2, \"xmax\": 663.99, \"ymax\": 399.98}, \"waypoint_deltas\": [{\"dx\": -0.63, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 4.01, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": -2.13, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 8.13, \"droll\": 0.0}, {\"dx\": -1.85, \"dy\": -3.18, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": 11.94, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -4.05, \"dz\": 0.0, \"dpitch\": 0.71, \"dyaw\": 14.43, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": -4.89, \"dz\": 0.0, \"dpitch\": 0.93, \"dyaw\": 16.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.6, "window_alt_abs_m": 0.17, "target_px_mean_hist": 771.0, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.44, -23.78, 20.0, -41.31, -53.16, 0.0]\n  Target bbox: [679.91, 394.67, 718.25, 470.97]\n\nFrame 2:\n  Drone pose: [-106.31, -24.48, 20.0, -50.4, -51.89, 0.0]\n  Target bbox: [673.86, 235.63, 725.59, 319.12]\n\nFrame 3:\n  Drone pose: [-106.11, -25.28, 20.06, -49.34, -57.13, 0.0]\n  Target bbox: [664.9, 237.43, 711.57, 316.06]\n\nFrame 4:\n  Drone pose: [-105.81, -25.76, 20.0, -45.17, -45.2, 0.0]\n  Target bbox: [615.99, 320.77, 664.33, 398.51]\n\nFrame 5 (current):\n  Drone pose: [-105.35, -26.45, 20.06, -48.13, -42.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 566.12, \"ymin\": 241.57, \"xmax\": 619.38, \"ymax\": 325.73}, \"waypoint_deltas\": [{\"dx\": 0.32, \"dy\": -0.45, \"dz\": -0.06, \"dpitch\": 3.05, \"dyaw\": -1.6, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": -1.01, \"dz\": -0.06, \"dpitch\": 3.09, \"dyaw\": -1.25, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -1.57, \"dz\": -0.06, \"dpitch\": 3.13, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -2.13, \"dz\": -0.06, \"dpitch\": 3.18, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": -2.69, \"dz\": -0.06, \"dpitch\": 3.24, \"dyaw\": -0.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.82, "window_alt_abs_m": 0.19, "target_px_mean_hist": 743.2, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.75, -30.19, 19.97, -49.34, -41.66, 0.0]\n  Target bbox: [651.05, 321.95, 695.69, 403.23]\n\nFrame 2:\n  Drone pose: [-102.21, -30.9, 20.08, -44.47, -48.13, 0.0]\n  Target bbox: [674.26, 367.82, 720.25, 450.21]\n\nFrame 3:\n  Drone pose: [-101.88, -31.37, 20.0, -44.63, -41.62, 0.0]\n  Target bbox: [619.56, 321.99, 660.98, 397.23]\n\nFrame 4:\n  Drone pose: [-101.46, -31.93, 20.0, -44.6, -41.32, 0.0]\n  Target bbox: [622.37, 322.25, 658.09, 396.97]\n\nFrame 5 (current):\n  Drone pose: [-101.0, -32.47, 20.0, -44.59, -41.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.42, \"ymin\": 321.87, \"xmax\": 661.14, \"ymax\": 397.42}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -1.34, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -2.21, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 2.9, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -3.22, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 5.03, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": -4.4, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": 7.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.43, "window_alt_abs_m": 0.19, "target_px_mean_hist": 784.5, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.78, -38.31, 20.0, -43.67, -29.57, 0.0]\n  Target bbox: [620.54, 322.51, 659.8, 396.82]\n\nFrame 2:\n  Drone pose: [-100.92, -39.46, 20.0, -43.32, -27.05, 0.0]\n  Target bbox: [616.6, 320.45, 663.84, 399.12]\n\nFrame 3:\n  Drone pose: [-101.02, -40.59, 20.0, -42.92, -24.69, 0.0]\n  Target bbox: [619.4, 322.02, 660.91, 397.43]\n\nFrame 4:\n  Drone pose: [-101.03, -41.61, 20.0, -43.32, -22.44, 0.0]\n  Target bbox: [615.28, 309.89, 656.18, 384.69]\n\nFrame 5 (current):\n  Drone pose: [-100.72, -42.29, 20.0, -46.08, -18.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 581.77, \"ymin\": 264.41, \"xmax\": 612.82, \"ymax\": 333.59}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": -0.55, \"dz\": 0.0, \"dpitch\": 3.75, \"dyaw\": -3.22, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 3.83, \"dyaw\": -3.27, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 3.9, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 3.73, \"dyaw\": -2.29, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": 3.81, \"dyaw\": -2.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.89, "window_alt_abs_m": 0.0, "target_px_mean_hist": 709.5, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776119424", "difficulty_score": 0.4246, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.36, 106.72, 22.0, -46.4, -98.53, 0.0]\n  Target bbox: [616.02, 320.97, 664.13, 398.52]\n\nFrame 2:\n  Drone pose: [107.87, 105.09, 21.2, -47.14, -96.06, 0.0]\n  Target bbox: [617.02, 324.67, 662.54, 394.49]\n\nFrame 3:\n  Drone pose: [107.06, 104.06, 20.67, -47.3, -93.69, 0.0]\n  Target bbox: [614.79, 318.68, 665.29, 400.63]\n\nFrame 4:\n  Drone pose: [106.62, 103.36, 20.64, -47.58, -93.91, 0.0]\n  Target bbox: [614.17, 318.28, 665.91, 401.05]\n\nFrame 5 (current):\n  Drone pose: [106.36, 102.78, 20.62, -47.63, -94.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.59, \"ymin\": 322.98, \"xmax\": 660.95, \"ymax\": 396.04}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": -1.02, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.52, \"dz\": -0.07, \"dpitch\": 0.02, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": -2.03, \"dz\": -0.09, \"dpitch\": 0.03, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": -0.61, \"dy\": -2.53, \"dz\": -0.2, \"dpitch\": 0.17, \"dyaw\": 1.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.85, "window_alt_abs_m": 1.38, "target_px_mean_hist": 683.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.67, 99.74, 20.39, -47.43, -92.51, 0.0]\n  Target bbox: [623.2, 323.51, 656.41, 395.5]\n\nFrame 2:\n  Drone pose: [105.6, 99.24, 20.36, -47.4, -92.27, 0.0]\n  Target bbox: [617.46, 323.55, 662.03, 395.45]\n\nFrame 3:\n  Drone pose: [105.53, 98.73, 20.33, -47.37, -92.07, 0.0]\n  Target bbox: [616.7, 324.31, 662.81, 394.66]\n\nFrame 4:\n  Drone pose: [105.48, 98.22, 20.3, -47.34, -91.89, 0.0]\n  Target bbox: [617.67, 323.48, 661.82, 395.52]\n\nFrame 5 (current):\n  Drone pose: [105.44, 97.72, 20.27, -47.31, -91.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.83, \"ymin\": 323.47, \"xmax\": 659.71, \"ymax\": 395.52}, \"waypoint_deltas\": [{\"dx\": -0.03, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.54, \"dz\": -0.08, \"dpitch\": 0.06, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -2.06, \"dz\": -0.1, \"dpitch\": 0.06, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -2.59, \"dz\": -0.12, \"dpitch\": 0.05, \"dyaw\": -0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.75, "window_alt_abs_m": 0.12, "target_px_mean_hist": 771.5, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.62, 94.6, 20.14, -47.28, -92.37, 0.0]\n  Target bbox: [618.22, 323.62, 661.3, 395.35]\n\nFrame 2:\n  Drone pose: [105.75, 94.06, 20.12, -47.31, -92.77, 0.0]\n  Target bbox: [619.93, 323.06, 659.62, 395.88]\n\nFrame 3:\n  Drone pose: [105.89, 93.52, 20.1, -47.35, -93.26, 0.0]\n  Target bbox: [620.3, 322.84, 659.26, 396.1]\n\nFrame 4:\n  Drone pose: [106.06, 92.96, 20.09, -47.4, -93.83, 0.0]\n  Target bbox: [622.68, 322.55, 656.93, 396.37]\n\nFrame 5 (current):\n  Drone pose: [106.25, 92.4, 20.08, -47.46, -94.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.0, \"ymin\": 322.8, \"xmax\": 661.53, \"ymax\": 396.16}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": -0.57, \"dz\": -0.01, \"dpitch\": -0.08, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": -1.15, \"dz\": -0.02, \"dpitch\": -0.16, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": -1.74, \"dz\": -0.03, \"dpitch\": -0.26, \"dyaw\": -2.03, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": -2.33, \"dz\": -0.04, \"dpitch\": -0.37, \"dyaw\": -2.58, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": -2.94, \"dz\": -0.04, \"dpitch\": -0.51, \"dyaw\": -2.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.09, "window_alt_abs_m": 0.06, "target_px_mean_hist": 776.0, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.15, 88.22, 20.03, -48.23, -99.34, 0.0]\n  Target bbox: [615.06, 318.03, 665.08, 401.1]\n\nFrame 2:\n  Drone pose: [107.08, 87.58, 20.02, -48.32, -100.85, 0.0]\n  Target bbox: [613.27, 316.49, 666.95, 402.72]\n\nFrame 3:\n  Drone pose: [106.96, 86.94, 20.02, -48.43, -102.22, 0.0]\n  Target bbox: [614.35, 317.03, 665.88, 402.11]\n\nFrame 4:\n  Drone pose: [106.81, 86.29, 20.02, -48.54, -103.47, 0.0]\n  Target bbox: [615.34, 317.5, 664.89, 401.57]\n\nFrame 5 (current):\n  Drone pose: [106.62, 85.64, 20.01, -48.65, -104.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.85, \"ymin\": 316.8, \"xmax\": 666.41, \"ymax\": 402.36}, \"waypoint_deltas\": [{\"dx\": -0.21, \"dy\": -0.65, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.1, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.29, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -2.16, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": -3.18, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -2.57, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": -4.16, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -3.2, \"dz\": -0.01, \"dpitch\": -0.46, \"dyaw\": -5.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.29, "window_alt_abs_m": 0.01, "target_px_mean_hist": 783.8, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.18, 81.81, 20.0, -49.17, -110.65, 0.0]\n  Target bbox: [616.67, 318.16, 663.64, 400.79]\n\nFrame 2:\n  Drone pose: [104.91, 81.19, 20.0, -49.23, -111.52, 0.0]\n  Target bbox: [618.08, 319.67, 662.2, 399.27]\n\nFrame 3:\n  Drone pose: [104.61, 80.58, 20.0, -49.27, -112.34, 0.0]\n  Target bbox: [617.45, 318.63, 662.86, 400.26]\n\nFrame 4:\n  Drone pose: [104.31, 79.98, 20.0, -49.31, -113.1, 0.0]\n  Target bbox: [613.12, 316.74, 667.28, 402.36]\n\nFrame 5 (current):\n  Drone pose: [103.98, 79.38, 20.0, -49.33, -113.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.11, \"ymin\": 318.28, \"xmax\": 664.25, \"ymax\": 400.7}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": -0.58, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.64, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": -1.15, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -1.71, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -1.73, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -2.2, \"droll\": 0.0}, {\"dx\": -1.81, \"dy\": -2.79, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -2.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.15, "window_alt_abs_m": 0.0, "target_px_mean_hist": 813.8, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.4, 75.57, 20.0, -49.14, -117.17, 0.0]\n  Target bbox: [616.41, 318.0, 664.01, 401.01]\n\nFrame 2:\n  Drone pose: [101.0, 75.08, 20.0, -49.03, -117.47, 0.0]\n  Target bbox: [616.44, 318.65, 663.3, 400.35]\n\nFrame 3:\n  Drone pose: [100.6, 74.61, 20.0, -49.31, -116.18, 0.0]\n  Target bbox: [619.6, 320.69, 660.68, 398.14]\n\nFrame 4:\n  Drone pose: [100.19, 74.14, 20.0, -49.19, -116.42, 0.0]\n  Target bbox: [615.57, 317.7, 664.87, 401.35]\n\nFrame 5 (current):\n  Drone pose: [99.79, 73.69, 20.0, -48.82, -116.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.74, \"ymin\": 320.05, \"xmax\": 661.58, \"ymax\": 398.88}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -0.91, \"dz\": 0.0, \"dpitch\": 0.34, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -1.37, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -1.08, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -1.84, \"dz\": 0.0, \"dpitch\": 0.76, \"dyaw\": -1.71, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -2.32, \"dz\": 0.0, \"dpitch\": 1.04, \"dyaw\": -2.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.08, "window_alt_abs_m": 0.0, "target_px_mean_hist": 790.0, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.1, 70.9, 20.0, -47.86, -118.76, 0.0]\n  Target bbox: [614.25, 317.77, 665.53, 401.36]\n\nFrame 2:\n  Drone pose: [98.02, 70.41, 20.0, -47.91, -118.51, 0.0]\n  Target bbox: [612.24, 317.32, 667.57, 401.87]\n\nFrame 3:\n  Drone pose: [97.98, 69.92, 20.0, -47.93, -118.39, 0.0]\n  Target bbox: [612.3, 317.29, 667.5, 401.89]\n\nFrame 4:\n  Drone pose: [97.98, 69.42, 20.0, -47.93, -118.38, 0.0]\n  Target bbox: [612.23, 317.34, 667.58, 401.84]\n\nFrame 5 (current):\n  Drone pose: [98.0, 68.91, 20.0, -47.92, -118.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.9, \"ymin\": 317.76, \"xmax\": 665.9, \"ymax\": 401.33}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -2.48, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.43, "window_alt_abs_m": 0.0, "target_px_mean_hist": 783.8, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.13, 65.94, 20.0, -47.77, -118.78, 0.0]\n  Target bbox: [612.54, 317.23, 667.27, 401.94]\n\nFrame 2:\n  Drone pose: [98.13, 65.46, 20.0, -47.75, -118.75, 0.0]\n  Target bbox: [611.64, 317.58, 668.21, 401.65]\n\nFrame 3:\n  Drone pose: [98.12, 64.98, 20.0, -47.73, -118.69, 0.0]\n  Target bbox: [616.04, 318.63, 663.72, 400.47]\n\nFrame 4:\n  Drone pose: [98.09, 64.5, 20.0, -47.72, -118.56, 0.0]\n  Target bbox: [612.65, 317.02, 667.14, 402.14]\n\nFrame 5 (current):\n  Drone pose: [98.04, 64.03, 20.0, -47.71, -118.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.55, \"ymin\": 317.83, \"xmax\": 666.26, \"ymax\": 401.3}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -0.93, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": -0.29, \"dy\": -1.39, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": 2.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.4, "window_alt_abs_m": 0.0, "target_px_mean_hist": 802.2, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.13, 60.73, 20.0, -48.1, -115.32, 0.0]\n  Target bbox: [612.44, 317.66, 667.31, 401.48]\n\nFrame 2:\n  Drone pose: [96.99, 60.25, 20.0, -48.18, -114.88, 0.0]\n  Target bbox: [611.35, 317.83, 668.42, 401.35]\n\nFrame 3:\n  Drone pose: [96.89, 59.76, 20.0, -48.24, -114.55, 0.0]\n  Target bbox: [613.65, 317.48, 666.07, 401.59]\n\nFrame 4:\n  Drone pose: [96.82, 59.26, 20.0, -48.28, -114.34, 0.0]\n  Target bbox: [618.91, 319.37, 660.8, 399.59]\n\nFrame 5 (current):\n  Drone pose: [96.8, 58.75, 20.0, -48.31, -114.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 611.35, \"ymin\": 317.86, \"xmax\": 668.41, \"ymax\": 401.3}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.33, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -1.64, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -2.22, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -1.1, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -2.82, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": -1.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 802.0, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.25, 55.33, 20.0, -48.63, -116.31, 0.0]\n  Target bbox: [612.96, 317.27, 666.79, 401.8]\n\nFrame 2:\n  Drone pose: [97.32, 54.72, 20.0, -48.74, -116.69, 0.0]\n  Target bbox: [613.4, 317.43, 666.37, 401.61]\n\nFrame 3:\n  Drone pose: [97.36, 54.12, 20.0, -48.86, -116.95, 0.0]\n  Target bbox: [613.13, 317.43, 666.65, 401.59]\n\nFrame 4:\n  Drone pose: [97.35, 53.53, 20.0, -49.01, -117.08, 0.0]\n  Target bbox: [615.75, 317.73, 664.69, 401.3]\n\nFrame 5 (current):\n  Drone pose: [97.31, 52.95, 20.0, -48.75, -118.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.18, \"ymin\": 316.83, \"xmax\": 667.62, \"ymax\": 402.27}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": -0.48, \"dyaw\": -2.36, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -1.16, \"dz\": 0.0, \"dpitch\": -0.94, \"dyaw\": -4.81, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": -1.77, \"dz\": 0.0, \"dpitch\": -1.37, \"dyaw\": -7.4, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": -2.39, \"dz\": 0.0, \"dpitch\": -1.77, \"dyaw\": -10.11, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -3.01, \"dz\": 0.0, \"dpitch\": -2.1, \"dyaw\": -12.89, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.32, "window_alt_abs_m": 0.0, "target_px_mean_hist": 795.2, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.34, 106.64, 22.08, -46.34, -105.76, 0.0]\n  Target bbox: [621.79, 327.27, 658.52, 391.93]\n\nFrame 2:\n  Drone pose: [107.87, 105.09, 21.2, -47.02, -100.09, 0.0]\n  Target bbox: [660.27, 322.58, 712.96, 403.19]\n\nFrame 3:\n  Drone pose: [107.04, 104.19, 20.52, -50.3, -92.05, 0.0]\n  Target bbox: [665.22, 335.78, 721.26, 421.7]\n\nFrame 4:\n  Drone pose: [106.62, 103.36, 20.64, -47.58, -93.91, 0.0]\n  Target bbox: [613.33, 317.62, 666.69, 401.7]\n\nFrame 5 (current):\n  Drone pose: [106.36, 102.78, 20.62, -47.63, -94.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.19, \"ymin\": 323.86, \"xmax\": 663.34, \"ymax\": 395.15}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": -1.02, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.52, \"dz\": -0.07, \"dpitch\": 0.02, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": -2.03, \"dz\": -0.09, \"dpitch\": 0.03, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": -0.61, \"dy\": -2.53, \"dz\": -0.2, \"dpitch\": 0.17, \"dyaw\": 1.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.36, "window_alt_abs_m": 1.72, "target_px_mean_hist": 670.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.67, 99.74, 20.39, -45.7, -97.51, 0.0]\n  Target bbox: [673.95, 353.27, 720.25, 427.55]\n\nFrame 2:\n  Drone pose: [105.6, 99.24, 20.36, -48.91, -94.89, 0.0]\n  Target bbox: [648.45, 298.33, 691.7, 370.81]\n\nFrame 3:\n  Drone pose: [105.53, 98.73, 20.33, -48.47, -87.34, 0.0]\n  Target bbox: [562.25, 307.39, 609.42, 377.94]\n\nFrame 4:\n  Drone pose: [105.48, 98.22, 20.3, -47.11, -96.68, 0.0]\n  Target bbox: [667.73, 323.35, 722.22, 407.01]\n\nFrame 5 (current):\n  Drone pose: [105.27, 97.77, 20.3, -42.49, -85.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 605.21, \"ymin\": 324.59, \"xmax\": 654.1, \"ymax\": 401.46}, \"waypoint_deltas\": [{\"dx\": 0.14, \"dy\": -0.56, \"dz\": -0.06, \"dpitch\": -4.8, \"dyaw\": -5.87, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -1.08, \"dz\": -0.08, \"dpitch\": -4.78, \"dyaw\": -5.86, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -1.59, \"dz\": -0.11, \"dpitch\": -4.76, \"dyaw\": -5.91, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": -2.11, \"dz\": -0.13, \"dpitch\": -4.76, \"dyaw\": -6.05, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": -2.64, \"dz\": -0.15, \"dpitch\": -4.77, \"dyaw\": -6.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 30.38, "window_alt_abs_m": 0.09, "target_px_mean_hist": 770.0, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.62, 94.6, 20.14, -47.28, -92.37, 0.0]\n  Target bbox: [619.06, 323.38, 660.47, 395.57]\n\nFrame 2:\n  Drone pose: [105.75, 94.06, 20.12, -47.31, -92.77, 0.0]\n  Target bbox: [616.69, 323.67, 662.8, 395.34]\n\nFrame 3:\n  Drone pose: [105.89, 93.52, 20.1, -43.27, -89.78, 0.0]\n  Target bbox: [583.15, 392.82, 616.47, 465.13]\n\nFrame 4:\n  Drone pose: [106.06, 92.96, 20.09, -44.57, -94.82, 0.0]\n  Target bbox: [628.51, 370.26, 674.04, 444.02]\n\nFrame 5 (current):\n  Drone pose: [106.25, 92.4, 20.08, -47.46, -94.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.86, \"ymin\": 322.49, \"xmax\": 662.64, \"ymax\": 396.49}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": -0.57, \"dz\": -0.01, \"dpitch\": -0.08, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": -1.15, \"dz\": -0.02, \"dpitch\": -0.16, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": -1.74, \"dz\": -0.03, \"dpitch\": -0.26, \"dyaw\": -2.03, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": -2.33, \"dz\": -0.04, \"dpitch\": -0.37, \"dyaw\": -2.58, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": -2.94, \"dz\": -0.04, \"dpitch\": -0.51, \"dyaw\": -2.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.77, "window_alt_abs_m": 0.06, "target_px_mean_hist": 782.2, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.15, 88.22, 20.03, -48.23, -99.34, 0.0]\n  Target bbox: [613.87, 316.87, 666.31, 402.32]\n\nFrame 2:\n  Drone pose: [107.08, 87.58, 20.02, -48.16, -105.6, 0.0]\n  Target bbox: [666.61, 321.46, 721.04, 406.67]\n\nFrame 3:\n  Drone pose: [106.96, 86.94, 20.02, -43.43, -98.63, 0.0]\n  Target bbox: [572.45, 401.36, 626.21, 488.02]\n\nFrame 4:\n  Drone pose: [106.81, 86.29, 20.02, -50.34, -101.67, 0.0]\n  Target bbox: [594.65, 287.96, 644.33, 370.99]\n\nFrame 5 (current):\n  Drone pose: [106.62, 85.64, 20.01, -48.65, -104.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 611.93, \"ymin\": 316.08, \"xmax\": 668.35, \"ymax\": 403.18}, \"waypoint_deltas\": [{\"dx\": -0.21, \"dy\": -0.65, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.1, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.29, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -2.16, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": -3.18, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -2.57, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": -4.16, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -3.2, \"dz\": -0.01, \"dpitch\": -0.46, \"dyaw\": -5.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.24, "window_alt_abs_m": 0.01, "target_px_mean_hist": 763.2, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.18, 81.81, 20.0, -49.17, -110.65, 0.0]\n  Target bbox: [616.77, 319.22, 663.5, 399.74]\n\nFrame 2:\n  Drone pose: [104.91, 81.19, 20.0, -50.86, -106.52, 0.0]\n  Target bbox: [557.69, 291.61, 611.47, 376.12]\n\nFrame 3:\n  Drone pose: [104.61, 80.58, 20.0, -52.78, -112.9, 0.0]\n  Target bbox: [619.21, 257.55, 673.87, 343.72]\n\nFrame 4:\n  Drone pose: [104.31, 79.98, 20.0, -49.31, -113.1, 0.0]\n  Target bbox: [616.23, 318.59, 664.09, 400.38]\n\nFrame 5 (current):\n  Drone pose: [103.98, 79.38, 20.0, -48.56, -108.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 561.78, \"ymin\": 333.75, \"xmax\": 607.91, \"ymax\": 414.42}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": -0.58, \"dz\": 0.0, \"dpitch\": -0.78, \"dyaw\": -5.64, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": -1.15, \"dz\": 0.0, \"dpitch\": -0.79, \"dyaw\": -6.22, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -1.71, \"dz\": 0.0, \"dpitch\": -0.78, \"dyaw\": -6.73, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": -0.76, \"dyaw\": -7.2, \"droll\": 0.0}, {\"dx\": -1.81, \"dy\": -2.79, \"dz\": 0.0, \"dpitch\": -0.72, \"dyaw\": -7.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 816.5, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.4, 75.57, 20.0, -52.66, -114.32, 0.0]\n  Target bbox: [583.69, 258.45, 632.61, 343.37]\n\nFrame 2:\n  Drone pose: [101.0, 75.08, 20.0, -50.53, -122.47, 0.0]\n  Target bbox: [671.45, 295.17, 719.57, 377.11]\n\nFrame 3:\n  Drone pose: [100.6, 74.61, 20.0, -52.85, -111.58, 0.0]\n  Target bbox: [560.02, 258.08, 617.55, 345.0]\n\nFrame 4:\n  Drone pose: [100.14, 74.02, 20.11, -46.37, -112.15, 0.0]\n  Target bbox: [641.26, 324.7, 688.76, 404.29]\n\nFrame 5 (current):\n  Drone pose: [99.79, 73.69, 20.0, -45.0, -115.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 601.97, \"ymin\": 381.2, \"xmax\": 655.09, \"ymax\": 466.64}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": -3.65, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -0.91, \"dz\": 0.0, \"dpitch\": -3.48, \"dyaw\": -1.65, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -1.37, \"dz\": 0.0, \"dpitch\": -3.29, \"dyaw\": -2.12, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -1.84, \"dz\": 0.0, \"dpitch\": -3.06, \"dyaw\": -2.75, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -2.32, \"dz\": 0.0, \"dpitch\": -2.78, \"dyaw\": -3.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.08, "window_alt_abs_m": 0.23, "target_px_mean_hist": 795.2, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.1, 70.9, 20.0, -47.86, -118.76, 0.0]\n  Target bbox: [614.73, 317.94, 665.26, 401.17]\n\nFrame 2:\n  Drone pose: [98.01, 70.33, 20.11, -44.56, -113.09, 0.0]\n  Target bbox: [615.28, 320.05, 664.77, 399.37]\n\nFrame 3:\n  Drone pose: [97.98, 69.92, 20.0, -47.93, -118.39, 0.0]\n  Target bbox: [613.73, 317.75, 666.08, 401.35]\n\nFrame 4:\n  Drone pose: [97.98, 69.42, 20.0, -47.93, -118.38, 0.0]\n  Target bbox: [616.33, 317.61, 664.04, 401.49]\n\nFrame 5 (current):\n  Drone pose: [98.04, 68.94, 20.06, -53.07, -115.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.49, \"ymin\": 316.52, \"xmax\": 668.05, \"ymax\": 402.13}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.53, \"dz\": -0.06, \"dpitch\": 5.17, \"dyaw\": -3.2, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.03, \"dz\": -0.06, \"dpitch\": 5.19, \"dyaw\": -3.29, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.53, \"dz\": -0.06, \"dpitch\": 5.22, \"dyaw\": -3.36, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -2.02, \"dz\": -0.06, \"dpitch\": 5.25, \"dyaw\": -3.42, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -2.51, \"dz\": -0.06, \"dpitch\": 5.27, \"dyaw\": -3.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.04, "window_alt_abs_m": 0.29, "target_px_mean_hist": 782.8, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.13, 65.94, 20.0, -47.77, -118.78, 0.0]\n  Target bbox: [616.83, 318.54, 662.93, 400.49]\n\nFrame 2:\n  Drone pose: [98.13, 65.46, 20.0, -47.45, -113.75, 0.0]\n  Target bbox: [556.84, 325.04, 608.72, 407.65]\n\nFrame 3:\n  Drone pose: [98.12, 64.98, 20.0, -47.73, -118.69, 0.0]\n  Target bbox: [611.58, 317.54, 668.29, 401.67]\n\nFrame 4:\n  Drone pose: [98.09, 64.5, 20.0, -47.72, -118.56, 0.0]\n  Target bbox: [615.05, 318.3, 664.74, 400.82]\n\nFrame 5 (current):\n  Drone pose: [98.04, 64.03, 20.0, -49.76, -113.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 557.5, \"ymin\": 286.02, \"xmax\": 607.91, \"ymax\": 367.85}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 2.05, \"dyaw\": -4.73, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -0.93, \"dz\": 0.0, \"dpitch\": 2.02, \"dyaw\": -4.39, \"droll\": 0.0}, {\"dx\": -0.29, \"dy\": -1.39, \"dz\": 0.0, \"dpitch\": 1.98, \"dyaw\": -3.97, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 1.92, \"dyaw\": -3.49, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": 1.84, \"dyaw\": -2.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.27, "window_alt_abs_m": 0.0, "target_px_mean_hist": 799.5, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.13, 60.73, 20.0, -48.1, -115.32, 0.0]\n  Target bbox: [611.71, 317.93, 668.07, 401.23]\n\nFrame 2:\n  Drone pose: [96.99, 60.25, 20.0, -48.18, -114.88, 0.0]\n  Target bbox: [613.38, 317.38, 666.71, 401.79]\n\nFrame 3:\n  Drone pose: [96.86, 59.94, 19.96, -45.61, -119.34, 0.0]\n  Target bbox: [607.84, 330.68, 656.95, 413.98]\n\nFrame 4:\n  Drone pose: [96.77, 59.15, 19.89, -50.96, -114.22, 0.0]\n  Target bbox: [621.38, 320.4, 658.21, 398.22]\n\nFrame 5 (current):\n  Drone pose: [96.8, 58.75, 20.0, -45.61, -110.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.91, \"ymin\": 364.82, \"xmax\": 621.93, \"ymax\": 447.06}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -2.73, \"dyaw\": -3.88, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": -2.77, \"dyaw\": -4.12, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -1.64, \"dz\": 0.0, \"dpitch\": -2.81, \"dyaw\": -4.46, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -2.22, \"dz\": 0.0, \"dpitch\": -2.86, \"dyaw\": -4.89, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -2.82, \"dz\": 0.0, \"dpitch\": -2.93, \"dyaw\": -5.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.73, "window_alt_abs_m": 0.22, "target_px_mean_hist": 815.5, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.28, 55.23, 19.88, -50.77, -115.99, 0.0]\n  Target bbox: [665.59, 352.34, 717.73, 439.09]\n\nFrame 2:\n  Drone pose: [97.32, 54.72, 20.0, -49.7, -111.69, 0.0]\n  Target bbox: [560.36, 304.38, 607.4, 385.78]\n\nFrame 3:\n  Drone pose: [97.36, 54.12, 20.0, -45.34, -121.95, 0.0]\n  Target bbox: [666.38, 377.32, 725.56, 463.92]\n\nFrame 4:\n  Drone pose: [97.35, 53.53, 20.0, -49.01, -117.08, 0.0]\n  Target bbox: [618.1, 319.74, 662.24, 399.18]\n\nFrame 5 (current):\n  Drone pose: [97.31, 52.95, 20.0, -48.19, -123.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 671.67, \"ymin\": 329.55, \"xmax\": 719.99, \"ymax\": 412.02}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": -1.04, \"dyaw\": 2.64, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -1.16, \"dz\": 0.0, \"dpitch\": -1.5, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": -1.77, \"dz\": 0.0, \"dpitch\": -1.93, \"dyaw\": -2.4, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": -2.39, \"dz\": 0.0, \"dpitch\": -2.33, \"dyaw\": -5.11, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -3.01, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -7.89, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.99, "window_alt_abs_m": 0.12, "target_px_mean_hist": 847.0, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776219939", "difficulty_score": 0.5036, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [128.89, -8.06, 22.0, -46.42, 180.0, 0.0]\n  Target bbox: [627.68, 330.36, 652.32, 388.65]\n\nFrame 2:\n  Drone pose: [126.64, -12.23, 21.2, -47.2, 167.11, 0.0]\n  Target bbox: [625.14, 326.3, 654.67, 392.67]\n\nFrame 3:\n  Drone pose: [124.8, -15.68, 20.67, -46.7, 155.72, 0.0]\n  Target bbox: [622.37, 323.99, 657.47, 395.05]\n\nFrame 4:\n  Drone pose: [123.49, -17.78, 20.65, -46.27, 148.87, 0.0]\n  Target bbox: [622.78, 323.91, 657.11, 395.13]\n\nFrame 5 (current):\n  Drone pose: [122.58, -18.93, 20.62, -45.81, 145.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.86, \"ymin\": 324.55, \"xmax\": 657.08, \"ymax\": 394.55}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.54, \"dz\": -0.03, \"dpitch\": 0.31, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": -0.77, \"dz\": -0.05, \"dpitch\": 0.49, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": -1.72, \"dy\": -0.85, \"dz\": -0.07, \"dpitch\": 0.58, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": -2.23, \"dy\": -0.88, \"dz\": -0.09, \"dpitch\": 0.63, \"dyaw\": -2.55, \"droll\": 0.0}, {\"dx\": -2.75, \"dy\": -0.91, \"dz\": -0.2, \"dpitch\": 0.79, \"dyaw\": -2.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 34.71, "window_alt_abs_m": 1.38, "target_px_mean_hist": 418.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [119.83, -19.84, 20.42, -45.02, 142.64, 0.0]\n  Target bbox: [622.57, 323.8, 657.37, 395.33]\n\nFrame 2:\n  Drone pose: [119.3, -19.87, 20.39, -44.98, 142.52, 0.0]\n  Target bbox: [621.47, 323.27, 658.47, 395.91]\n\nFrame 3:\n  Drone pose: [118.76, -19.91, 20.36, -44.94, 142.35, 0.0]\n  Target bbox: [623.61, 324.77, 656.36, 394.33]\n\nFrame 4:\n  Drone pose: [118.22, -19.96, 20.33, -44.9, 142.17, 0.0]\n  Target bbox: [623.64, 324.66, 656.36, 394.41]\n\nFrame 5 (current):\n  Drone pose: [117.68, -20.01, 20.3, -44.86, 141.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.96, \"ymin\": 325.02, \"xmax\": 656.02, \"ymax\": 394.07}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.04, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.07, \"dz\": -0.05, \"dpitch\": 0.07, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": -0.09, \"dz\": -0.08, \"dpitch\": 0.11, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.11, \"dz\": -0.11, \"dpitch\": 0.15, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": -0.12, \"dz\": -0.13, \"dpitch\": 0.18, \"dyaw\": -0.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.66, "window_alt_abs_m": 0.12, "target_px_mean_hist": 548.2, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [115.09, -20.13, 20.17, -44.68, 141.53, 0.0]\n  Target bbox: [625.47, 326.12, 654.58, 392.89]\n\nFrame 2:\n  Drone pose: [114.58, -20.13, 20.15, -44.65, 141.52, 0.0]\n  Target bbox: [624.87, 325.77, 655.14, 393.29]\n\nFrame 3:\n  Drone pose: [114.08, -20.13, 20.14, -44.63, 141.51, 0.0]\n  Target bbox: [622.63, 323.73, 657.35, 395.38]\n\nFrame 4:\n  Drone pose: [113.58, -20.13, 20.12, -44.61, 141.52, 0.0]\n  Target bbox: [623.25, 324.24, 656.72, 394.87]\n\nFrame 5 (current):\n  Drone pose: [113.08, -20.12, 20.11, -44.59, 141.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.93, \"ymin\": 323.32, \"xmax\": 658.04, \"ymax\": 395.82}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.05, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.29, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.06, \"dpitch\": 0.31, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.03, "window_alt_abs_m": 0.07, "target_px_mean_hist": 554.5, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [110.58, -20.12, 20.05, -44.28, 141.55, 0.0]\n  Target bbox: [622.22, 323.05, 657.73, 396.08]\n\nFrame 2:\n  Drone pose: [110.08, -20.12, 20.04, -44.28, 141.55, 0.0]\n  Target bbox: [622.05, 323.12, 657.92, 396.01]\n\nFrame 3:\n  Drone pose: [109.58, -20.12, 20.04, -44.27, 141.55, 0.0]\n  Target bbox: [622.26, 323.09, 657.68, 396.04]\n\nFrame 4:\n  Drone pose: [109.08, -20.12, 20.03, -44.26, 141.55, 0.0]\n  Target bbox: [621.91, 323.22, 658.04, 395.95]\n\nFrame 5 (current):\n  Drone pose: [108.58, -20.12, 20.03, -44.25, 141.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.86, \"ymin\": 324.24, \"xmax\": 656.13, \"ymax\": 394.82}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.01, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.01, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.02, "target_px_mean_hist": 558.2, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.08, -20.12, 20.01, -44.22, 141.55, 0.0]\n  Target bbox: [622.23, 323.03, 657.71, 396.11]\n\nFrame 2:\n  Drone pose: [105.58, -20.12, 20.01, -44.22, 141.55, 0.0]\n  Target bbox: [621.29, 322.69, 658.66, 396.5]\n\nFrame 3:\n  Drone pose: [105.08, -20.11, 20.01, -44.22, 141.55, 0.0]\n  Target bbox: [622.07, 322.96, 657.87, 396.19]\n\nFrame 4:\n  Drone pose: [104.58, -20.11, 20.01, -44.22, 141.55, 0.0]\n  Target bbox: [622.55, 323.7, 657.4, 395.45]\n\nFrame 5 (current):\n  Drone pose: [104.08, -20.11, 20.01, -44.22, 141.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.07, \"ymin\": 323.54, \"xmax\": 656.89, \"ymax\": 395.55}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.02, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.03, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.03, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.01, "target_px_mean_hist": 563.8, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.57, -20.1, 20.0, -44.23, 141.56, 0.0]\n  Target bbox: [622.08, 322.88, 657.86, 396.26]\n\nFrame 2:\n  Drone pose: [101.07, -20.02, 20.0, -44.31, 141.76, 0.0]\n  Target bbox: [622.14, 323.3, 657.82, 395.85]\n\nFrame 3:\n  Drone pose: [100.57, -19.87, 20.0, -44.44, 142.09, 0.0]\n  Target bbox: [623.47, 326.33, 656.75, 392.64]\n\nFrame 4:\n  Drone pose: [100.07, -19.68, 20.0, -44.16, 141.37, 0.0]\n  Target bbox: [623.13, 323.89, 656.84, 395.23]\n\nFrame 5 (current):\n  Drone pose: [99.57, -19.46, 20.0, -44.36, 141.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.79, \"ymin\": 325.98, \"xmax\": 654.45, \"ymax\": 393.06}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.25, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": -0.62, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.94, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -2.49, \"dy\": 1.14, \"dz\": 0.0, \"dpitch\": 0.34, \"dyaw\": -0.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.78, "window_alt_abs_m": 0.0, "target_px_mean_hist": 580.0, "cur_frame_id": 49, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.08, -18.32, 20.0, -44.02, 141.08, 0.0]\n  Target bbox: [623.69, 324.42, 656.29, 394.69]\n\nFrame 2:\n  Drone pose: [96.58, -18.13, 20.0, -44.2, 141.52, 0.0]\n  Target bbox: [621.93, 323.1, 658.03, 396.06]\n\nFrame 3:\n  Drone pose: [96.08, -17.92, 20.0, -44.39, 141.99, 0.0]\n  Target bbox: [627.62, 325.34, 652.59, 393.66]\n\nFrame 4:\n  Drone pose: [95.57, -17.7, 20.0, -44.13, 141.33, 0.0]\n  Target bbox: [624.24, 325.16, 655.76, 393.94]\n\nFrame 5 (current):\n  Drone pose: [95.08, -17.47, 20.0, -44.34, 141.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.98, \"ymin\": 325.66, \"xmax\": 657.28, \"ymax\": 393.35}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.25, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": -2.43, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -0.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.11, "window_alt_abs_m": 0.0, "target_px_mean_hist": 578.5, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.65, -16.18, 20.0, -44.06, 141.53, 0.0]\n  Target bbox: [624.72, 325.4, 655.3, 393.68]\n\nFrame 2:\n  Drone pose: [92.17, -15.95, 20.0, -44.26, 142.11, 0.0]\n  Target bbox: [622.99, 325.39, 657.29, 393.66]\n\nFrame 3:\n  Drone pose: [91.67, -15.72, 20.0, -44.0, 141.48, 0.0]\n  Target bbox: [623.77, 324.03, 656.21, 395.04]\n\nFrame 4:\n  Drone pose: [91.18, -15.5, 20.0, -44.2, 141.99, 0.0]\n  Target bbox: [624.42, 325.09, 655.58, 393.99]\n\nFrame 5 (current):\n  Drone pose: [90.67, -15.27, 20.0, -44.41, 142.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.21, \"ymin\": 325.63, \"xmax\": 656.04, \"ymax\": 393.43}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.24, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -0.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.24, "window_alt_abs_m": 0.0, "target_px_mean_hist": 571.5, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [88.09, -14.06, 20.0, -44.25, 141.71, 0.0]\n  Target bbox: [622.24, 323.03, 657.71, 396.1]\n\nFrame 2:\n  Drone pose: [87.58, -13.83, 20.0, -44.47, 142.23, 0.0]\n  Target bbox: [622.66, 325.27, 657.62, 393.76]\n\nFrame 3:\n  Drone pose: [87.08, -13.61, 20.0, -44.21, 141.55, 0.0]\n  Target bbox: [622.23, 323.03, 657.71, 396.11]\n\nFrame 4:\n  Drone pose: [86.57, -13.42, 20.0, -44.39, 141.99, 0.0]\n  Target bbox: [626.07, 326.7, 654.12, 392.27]\n\nFrame 5 (current):\n  Drone pose: [86.07, -13.24, 20.0, -44.1, 141.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.54, \"ymin\": 324.35, \"xmax\": 656.49, \"ymax\": 394.73}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": -0.33, \"dyaw\": 0.85, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.79, \"dz\": 0.0, \"dpitch\": -0.26, \"dyaw\": 0.68, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.4, "window_alt_abs_m": 0.0, "target_px_mean_hist": 584.8, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [83.57, -12.22, 20.0, -44.12, 141.3, 0.0]\n  Target bbox: [622.04, 322.93, 657.9, 396.23]\n\nFrame 2:\n  Drone pose: [83.08, -11.98, 20.0, -44.34, 141.87, 0.0]\n  Target bbox: [626.88, 324.61, 653.37, 394.45]\n\nFrame 3:\n  Drone pose: [82.58, -11.72, 20.0, -44.11, 141.3, 0.0]\n  Target bbox: [622.04, 322.93, 657.9, 396.23]\n\nFrame 4:\n  Drone pose: [82.08, -11.47, 20.0, -44.34, 141.89, 0.0]\n  Target bbox: [626.85, 323.72, 653.42, 395.36]\n\nFrame 5 (current):\n  Drone pose: [81.58, -11.23, 20.0, -44.1, 141.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.64, \"ymin\": 325.34, \"xmax\": 655.37, \"ymax\": 393.74}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.58, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.33, "window_alt_abs_m": 0.0, "target_px_mean_hist": 593.2, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [128.89, -8.06, 22.0, -46.42, 180.0, 0.0]\n  Target bbox: [627.82, 329.71, 652.18, 389.36]\n\nFrame 2:\n  Drone pose: [126.53, -12.1, 21.14, -47.32, 167.44, 0.0]\n  Target bbox: [624.49, 326.19, 655.31, 392.78]\n\nFrame 3:\n  Drone pose: [124.8, -15.68, 20.67, -49.72, 150.72, 0.0]\n  Target bbox: [678.78, 274.57, 717.26, 346.94]\n\nFrame 4:\n  Drone pose: [123.45, -17.66, 20.61, -43.59, 148.19, 0.0]\n  Target bbox: [635.46, 372.65, 666.26, 439.67]\n\nFrame 5 (current):\n  Drone pose: [122.58, -18.93, 20.62, -45.81, 145.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.45, \"ymin\": 323.72, \"xmax\": 657.46, \"ymax\": 395.37}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.54, \"dz\": -0.03, \"dpitch\": 0.31, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": -0.77, \"dz\": -0.05, \"dpitch\": 0.49, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": -1.72, \"dy\": -0.85, \"dz\": -0.07, \"dpitch\": 0.58, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": -2.23, \"dy\": -0.88, \"dz\": -0.09, \"dpitch\": 0.63, \"dyaw\": -2.55, \"droll\": 0.0}, {\"dx\": -2.75, \"dy\": -0.91, \"dz\": -0.2, \"dpitch\": 0.79, \"dyaw\": -2.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 34.71, "window_alt_abs_m": 1.39, "target_px_mean_hist": 409.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [120.03, -19.82, 20.42, -47.33, 148.04, 0.0]\n  Target bbox: [560.63, 283.03, 599.22, 354.91]\n\nFrame 2:\n  Drone pose: [119.34, -19.76, 20.41, -45.06, 142.84, 0.0]\n  Target bbox: [624.43, 325.86, 655.56, 393.22]\n\nFrame 3:\n  Drone pose: [118.76, -19.91, 20.36, -44.94, 142.35, 0.0]\n  Target bbox: [622.26, 323.47, 657.67, 395.67]\n\nFrame 4:\n  Drone pose: [118.15, -19.92, 20.27, -44.94, 142.14, 0.0]\n  Target bbox: [624.58, 325.35, 655.43, 393.67]\n\nFrame 5 (current):\n  Drone pose: [117.72, -19.91, 20.19, -44.75, 142.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.72, \"ymin\": 324.13, \"xmax\": 657.23, \"ymax\": 395.02}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": -0.14, \"dz\": 0.08, \"dpitch\": -0.08, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": -0.17, \"dz\": 0.06, \"dpitch\": -0.04, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -1.61, \"dy\": -0.19, \"dz\": 0.03, \"dpitch\": 0.0, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": -2.63, \"dy\": -0.22, \"dz\": -0.02, \"dpitch\": 0.07, \"dyaw\": -0.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.06, "window_alt_abs_m": 0.22, "target_px_mean_hist": 553.2, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [115.07, -20.14, 20.22, -44.75, 141.48, 0.0]\n  Target bbox: [624.31, 325.05, 655.7, 393.99]\n\nFrame 2:\n  Drone pose: [114.55, -20.31, 20.08, -44.41, 141.05, 0.0]\n  Target bbox: [625.3, 326.0, 654.72, 393.06]\n\nFrame 3:\n  Drone pose: [113.91, -20.2, 20.19, -44.1, 146.04, 0.0]\n  Target bbox: [561.89, 338.18, 598.25, 409.68]\n\nFrame 4:\n  Drone pose: [113.58, -20.13, 20.12, -43.2, 137.44, 0.0]\n  Target bbox: [671.72, 349.46, 706.47, 419.42]\n\nFrame 5 (current):\n  Drone pose: [113.17, -20.0, 19.98, -46.17, 146.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.23, \"ymin\": 294.52, \"xmax\": 608.67, \"ymax\": 367.66}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": -0.12, \"dz\": 0.11, \"dpitch\": 1.6, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": -0.12, \"dz\": 0.1, \"dpitch\": 1.61, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.12, \"dz\": 0.09, \"dpitch\": 1.63, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.12, \"dz\": 0.08, \"dpitch\": 1.87, \"dyaw\": -4.55, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": -0.12, \"dz\": 0.07, \"dpitch\": 1.89, \"dyaw\": -4.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.68, "window_alt_abs_m": 0.47, "target_px_mean_hist": 567.8, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [110.58, -20.12, 20.05, -42.55, 146.55, 0.0]\n  Target bbox: [562.84, 356.25, 596.24, 424.69]\n\nFrame 2:\n  Drone pose: [110.08, -20.12, 20.04, -44.28, 141.55, 0.0]\n  Target bbox: [624.5, 324.82, 655.51, 394.21]\n\nFrame 3:\n  Drone pose: [109.74, -20.2, 20.03, -43.99, 141.64, 0.0]\n  Target bbox: [624.07, 325.1, 655.93, 394.02]\n\nFrame 4:\n  Drone pose: [109.08, -20.12, 20.03, -40.52, 143.33, 0.0]\n  Target bbox: [600.56, 386.27, 635.63, 458.94]\n\nFrame 5 (current):\n  Drone pose: [108.56, -20.24, 20.11, -44.94, 139.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 649.57, \"ymin\": 313.01, \"xmax\": 684.64, \"ymax\": 384.48}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": 0.12, \"dz\": -0.09, \"dpitch\": 0.7, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.12, \"dz\": -0.09, \"dpitch\": 0.7, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.13, \"dz\": -0.09, \"dpitch\": 0.71, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 0.13, \"dz\": -0.1, \"dpitch\": 0.71, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": 0.12, \"dz\": -0.1, \"dpitch\": 0.72, \"dyaw\": 2.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.12, "window_alt_abs_m": 0.09, "target_px_mean_hist": 574.0, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.03, -19.94, 20.0, -49.01, 138.07, 0.0]\n  Target bbox: [667.71, 246.82, 704.33, 319.9]\n\nFrame 2:\n  Drone pose: [105.58, -20.12, 20.01, -44.22, 141.55, 0.0]\n  Target bbox: [622.57, 323.6, 657.44, 395.51]\n\nFrame 3:\n  Drone pose: [105.08, -20.11, 20.01, -43.99, 136.55, 0.0]\n  Target bbox: [683.15, 330.03, 717.88, 400.55]\n\nFrame 4:\n  Drone pose: [104.68, -20.18, 19.86, -43.82, 141.57, 0.0]\n  Target bbox: [624.47, 324.76, 655.55, 394.29]\n\nFrame 5 (current):\n  Drone pose: [104.08, -20.11, 20.01, -44.22, 141.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.75, \"ymin\": 325.31, \"xmax\": 655.26, \"ymax\": 393.75}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.02, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.03, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.03, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.51, "window_alt_abs_m": 0.31, "target_px_mean_hist": 572.5, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.62, -19.99, 20.02, -47.01, 136.92, 0.0]\n  Target bbox: [682.9, 280.39, 718.19, 351.42]\n\nFrame 2:\n  Drone pose: [101.07, -20.02, 20.0, -45.53, 137.87, 0.0]\n  Target bbox: [668.79, 303.88, 705.26, 376.51]\n\nFrame 3:\n  Drone pose: [100.57, -19.87, 20.0, -44.44, 142.09, 0.0]\n  Target bbox: [623.71, 325.51, 656.56, 393.55]\n\nFrame 4:\n  Drone pose: [100.17, -19.65, 19.83, -39.34, 142.07, 0.0]\n  Target bbox: [616.79, 398.05, 652.18, 471.66]\n\nFrame 5 (current):\n  Drone pose: [99.63, -19.29, 20.06, -42.62, 143.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 606.27, \"ymin\": 357.78, \"xmax\": 637.84, \"ymax\": 426.05}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": 0.08, \"dz\": -0.06, \"dpitch\": -1.5, \"dyaw\": -2.57, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.32, \"dz\": -0.06, \"dpitch\": -1.72, \"dyaw\": -1.99, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": 0.56, \"dz\": -0.06, \"dpitch\": -1.47, \"dyaw\": -2.6, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": 0.77, \"dz\": -0.06, \"dpitch\": -1.67, \"dyaw\": -2.1, \"droll\": 0.0}, {\"dx\": -2.55, \"dy\": 0.97, \"dz\": -0.06, \"dpitch\": -1.4, \"dyaw\": -2.8, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.0, "window_alt_abs_m": 0.42, "target_px_mean_hist": 586.2, "cur_frame_id": 49, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.11, -18.41, 19.84, -47.2, 141.74, 0.0]\n  Target bbox: [612.36, 263.73, 647.65, 336.64]\n\nFrame 2:\n  Drone pose: [96.74, -18.22, 20.01, -43.93, 140.89, 0.0]\n  Target bbox: [631.68, 323.81, 666.26, 395.9]\n\nFrame 3:\n  Drone pose: [96.08, -17.92, 20.0, -42.93, 146.99, 0.0]\n  Target bbox: [565.68, 351.39, 594.01, 420.27]\n\nFrame 4:\n  Drone pose: [95.57, -17.7, 20.0, -41.82, 136.37, 0.0]\n  Target bbox: [683.86, 365.3, 716.61, 434.81]\n\nFrame 5 (current):\n  Drone pose: [95.07, -17.47, 19.88, -44.17, 141.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.43, \"ymin\": 324.44, \"xmax\": 651.81, \"ymax\": 394.64}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": 0.25, \"dz\": 0.12, \"dpitch\": 0.07, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.51, \"dz\": 0.12, \"dpitch\": -0.15, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": 0.77, \"dz\": 0.12, \"dpitch\": 0.08, \"dyaw\": -0.44, \"droll\": 0.0}, {\"dx\": -1.93, \"dy\": 1.03, \"dz\": 0.12, \"dpitch\": -0.13, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": -2.42, \"dy\": 1.29, \"dz\": 0.12, \"dpitch\": 0.11, \"dyaw\": -0.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.06, "window_alt_abs_m": 0.3, "target_px_mean_hist": 585.2, "cur_frame_id": 58, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.47, -16.2, 20.07, -39.36, 141.27, 0.0]\n  Target bbox: [623.07, 409.16, 653.84, 478.1]\n\nFrame 2:\n  Drone pose: [92.17, -15.95, 20.0, -44.38, 137.11, 0.0]\n  Target bbox: [682.98, 325.93, 718.29, 392.54]\n\nFrame 3:\n  Drone pose: [91.67, -15.72, 20.0, -44.0, 141.48, 0.0]\n  Target bbox: [622.36, 323.16, 657.58, 395.99]\n\nFrame 4:\n  Drone pose: [91.3, -15.46, 19.87, -43.9, 142.31, 0.0]\n  Target bbox: [621.55, 322.82, 658.39, 396.38]\n\nFrame 5 (current):\n  Drone pose: [90.67, -15.27, 20.0, -39.98, 139.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 666.55, \"ymin\": 401.16, \"xmax\": 693.9, \"ymax\": 468.18}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.24, \"dz\": 0.0, \"dpitch\": -4.2, \"dyaw\": 2.68, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": -4.44, \"dyaw\": 3.23, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": -4.23, \"dyaw\": 2.6, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -4.48, \"dyaw\": 3.14, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": -4.27, \"dyaw\": 2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.47, "window_alt_abs_m": 0.33, "target_px_mean_hist": 573.2, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [88.09, -14.06, 20.0, -44.25, 141.71, 0.0]\n  Target bbox: [623.6, 323.84, 656.38, 395.21]\n\nFrame 2:\n  Drone pose: [87.58, -13.83, 20.0, -44.47, 142.23, 0.0]\n  Target bbox: [628.54, 325.07, 651.68, 393.99]\n\nFrame 3:\n  Drone pose: [86.96, -13.59, 19.93, -45.52, 139.53, 0.0]\n  Target bbox: [647.65, 304.4, 677.57, 372.8]\n\nFrame 4:\n  Drone pose: [86.72, -13.53, 19.98, -43.53, 137.0, 0.0]\n  Target bbox: [687.39, 335.64, 713.98, 405.75]\n\nFrame 5 (current):\n  Drone pose: [86.07, -13.24, 20.0, -44.84, 142.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 602.3, \"ymin\": 310.69, \"xmax\": 639.37, \"ymax\": 384.15}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": 0.57, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": 0.41, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 0.68, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.79, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": 0.72, \"dyaw\": -1.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.55, "window_alt_abs_m": 0.14, "target_px_mean_hist": 590.5, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [83.57, -12.22, 20.0, -44.12, 141.3, 0.0]\n  Target bbox: [624.33, 324.72, 655.67, 394.33]\n\nFrame 2:\n  Drone pose: [82.98, -11.9, 19.94, -44.43, 141.88, 0.0]\n  Target bbox: [627.22, 325.78, 653.0, 393.26]\n\nFrame 3:\n  Drone pose: [82.43, -11.64, 19.93, -47.63, 145.73, 0.0]\n  Target bbox: [566.93, 268.32, 603.31, 340.33]\n\nFrame 4:\n  Drone pose: [82.1, -11.47, 20.05, -46.08, 141.64, 0.0]\n  Target bbox: [631.25, 297.99, 656.35, 364.53]\n\nFrame 5 (current):\n  Drone pose: [81.55, -11.19, 19.99, -47.67, 140.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 636.03, \"ymin\": 265.33, \"xmax\": 669.06, \"ymax\": 336.11}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": 0.17, \"dz\": 0.01, \"dpitch\": 3.38, \"dyaw\": 1.49, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.36, \"dz\": 0.01, \"dpitch\": 3.66, \"dyaw\": 0.77, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.54, \"dz\": 0.01, \"dpitch\": 3.49, \"dyaw\": 1.16, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 0.73, \"dz\": 0.01, \"dpitch\": 3.3, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": -2.49, \"dy\": 0.98, \"dz\": 0.01, \"dpitch\": 3.53, \"dyaw\": 0.99, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.85, "window_alt_abs_m": 0.25, "target_px_mean_hist": 591.8, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776222741", "difficulty_score": 0.2751, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-127.85, 38.79, 22.0, -46.19, 73.3, 0.0]\n  Target bbox: [619.47, 324.23, 660.24, 395.05]\n\nFrame 2:\n  Drone pose: [-129.61, 39.24, 21.2, -44.2, 68.65, 0.0]\n  Target bbox: [615.28, 323.01, 664.49, 396.48]\n\nFrame 3:\n  Drone pose: [-130.69, 40.08, 20.96, -43.7, 65.53, 0.0]\n  Target bbox: [616.22, 322.58, 663.57, 396.94]\n\nFrame 4:\n  Drone pose: [-131.25, 41.01, 20.94, -43.89, 63.61, 0.0]\n  Target bbox: [620.8, 323.98, 659.48, 395.4]\n\nFrame 5 (current):\n  Drone pose: [-131.53, 41.89, 20.93, -43.85, 61.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.61, \"ymin\": 322.11, \"xmax\": 662.82, \"ymax\": 397.45}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -1.93, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": -3.57, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 2.06, \"dz\": -0.01, \"dpitch\": 0.22, \"dyaw\": -3.74, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 2.58, \"dz\": 0.0, \"dpitch\": 0.22, \"dyaw\": -3.85, \"droll\": 0.0}, {\"dx\": -0.27, \"dy\": 3.14, \"dz\": -0.06, \"dpitch\": 0.24, \"dyaw\": -3.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.08, "window_alt_abs_m": 1.07, "target_px_mean_hist": 643.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.8, 45.03, 20.87, -43.61, 57.28, 0.0]\n  Target bbox: [618.05, 322.26, 661.79, 397.26]\n\nFrame 2:\n  Drone pose: [-132.11, 45.41, 20.88, -43.25, 56.73, 0.0]\n  Target bbox: [620.11, 323.05, 659.73, 396.4]\n\nFrame 3:\n  Drone pose: [-132.22, 45.88, 20.89, -43.16, 56.54, 0.0]\n  Target bbox: [618.58, 322.73, 661.26, 396.82]\n\nFrame 4:\n  Drone pose: [-132.26, 46.37, 20.91, -43.15, 56.45, 0.0]\n  Target bbox: [616.49, 321.82, 663.4, 397.79]\n\nFrame 5 (current):\n  Drone pose: [-132.26, 46.85, 20.94, -43.16, 56.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.71, \"ymin\": 322.0, \"xmax\": 663.2, \"ymax\": 397.58}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.48, \"dz\": 0.03, \"dpitch\": -0.05, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 0.96, \"dz\": 0.07, \"dpitch\": 0.25, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 1.43, \"dz\": 0.11, \"dpitch\": 0.16, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 1.9, \"dz\": 0.16, \"dpitch\": 0.42, \"dyaw\": -1.44, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": 2.37, \"dz\": 0.23, \"dpitch\": 0.27, \"dyaw\": -1.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.84, "window_alt_abs_m": 0.07, "target_px_mean_hist": 636.0, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.88, 49.22, 21.17, -42.89, 55.3, 0.0]\n  Target bbox: [619.65, 324.36, 660.75, 395.12]\n\nFrame 2:\n  Drone pose: [-131.77, 49.71, 21.24, -42.68, 54.48, 0.0]\n  Target bbox: [617.6, 322.89, 662.3, 396.75]\n\nFrame 3:\n  Drone pose: [-131.66, 50.21, 21.32, -42.87, 54.7, 0.0]\n  Target bbox: [621.54, 324.85, 658.85, 394.63]\n\nFrame 4:\n  Drone pose: [-131.56, 50.73, 21.39, -42.89, 53.83, 0.0]\n  Target bbox: [620.29, 324.47, 659.58, 395.08]\n\nFrame 5 (current):\n  Drone pose: [-131.47, 51.23, 21.47, -43.07, 54.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.06, \"ymin\": 325.57, \"xmax\": 658.31, \"ymax\": 393.9}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": 0.51, \"dz\": 0.08, \"dpitch\": 0.2, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 1.0, \"dz\": 0.22, \"dpitch\": -0.06, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 1.49, \"dz\": 0.37, \"dpitch\": -0.32, \"dyaw\": -0.52, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": 1.98, \"dz\": 0.53, \"dpitch\": -0.21, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": 0.44, \"dy\": 2.48, \"dz\": 0.6, \"dpitch\": -0.38, \"dyaw\": -1.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.1, "window_alt_abs_m": 0.3, "target_px_mean_hist": 621.0, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.03, 53.71, 22.07, -43.45, 52.88, 0.0]\n  Target bbox: [623.34, 327.62, 656.97, 391.79]\n\nFrame 2:\n  Drone pose: [-130.85, 54.23, 22.13, -43.31, 52.2, 0.0]\n  Target bbox: [617.25, 323.93, 662.74, 395.76]\n\nFrame 3:\n  Drone pose: [-130.47, 54.85, 22.18, -43.79, 52.79, 0.0]\n  Target bbox: [623.55, 326.48, 656.78, 392.96]\n\nFrame 4:\n  Drone pose: [-129.85, 55.6, 22.22, -44.21, 52.64, 0.0]\n  Target bbox: [616.39, 323.35, 663.61, 396.31]\n\nFrame 5 (current):\n  Drone pose: [-127.65, 56.07, 22.61, -46.39, 57.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.05, \"ymin\": 326.11, \"xmax\": 659.25, \"ymax\": 393.2}, \"waypoint_deltas\": [{\"dx\": 3.65, \"dy\": 0.24, \"dz\": 0.8, \"dpitch\": -2.84, \"dyaw\": 8.34, \"droll\": 0.0}, {\"dx\": 8.39, \"dy\": 0.31, \"dz\": 0.87, \"dpitch\": -4.26, \"dyaw\": 21.03, \"droll\": 0.0}, {\"dx\": 12.69, \"dy\": 0.36, \"dz\": 1.15, \"dpitch\": -4.49, \"dyaw\": 32.9, \"droll\": 0.0}, {\"dx\": 16.46, \"dy\": 0.37, \"dz\": 0.35, \"dpitch\": -2.29, \"dyaw\": 42.66, \"droll\": 0.0}, {\"dx\": 18.72, \"dy\": 0.56, \"dz\": -0.41, \"dpitch\": -0.29, \"dyaw\": 47.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.33, "window_alt_abs_m": 0.54, "target_px_mean_hist": 574.0, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.43, 57.13, 22.15, -46.61, 105.0, 0.0]\n  Target bbox: [615.72, 321.7, 664.06, 397.83]\n\nFrame 2:\n  Drone pose: [-107.93, 57.63, 22.09, -46.53, 105.0, 0.0]\n  Target bbox: [617.05, 322.28, 662.76, 397.16]\n\nFrame 3:\n  Drone pose: [-107.43, 58.13, 22.03, -46.44, 105.0, 0.0]\n  Target bbox: [615.32, 321.18, 664.46, 398.34]\n\nFrame 4:\n  Drone pose: [-106.93, 58.63, 21.95, -46.33, 105.0, 0.0]\n  Target bbox: [615.56, 321.44, 664.21, 398.11]\n\nFrame 5 (current):\n  Drone pose: [-106.43, 59.13, 21.8, -46.12, 105.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.38, \"ymin\": 321.01, \"xmax\": 663.37, \"ymax\": 398.48}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.5, \"dz\": -0.16, \"dpitch\": 0.22, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 1.0, \"dz\": -0.35, \"dpitch\": 0.47, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 1.5, \"dz\": -0.44, \"dpitch\": 0.6, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 2.0, \"dz\": -0.54, \"dpitch\": 0.74, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.07, \"dy\": 2.47, \"dz\": -0.63, \"dpitch\": 0.75, \"dyaw\": -1.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.36, "target_px_mean_hist": 608.5, "cur_frame_id": 41, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.36, 61.6, 21.17, -45.37, 103.8, 0.0]\n  Target bbox: [614.69, 320.08, 665.1, 399.46]\n\nFrame 2:\n  Drone pose: [-104.31, 62.08, 21.07, -45.35, 102.52, 0.0]\n  Target bbox: [615.83, 320.92, 664.0, 398.58]\n\nFrame 3:\n  Drone pose: [-104.35, 62.55, 20.98, -45.33, 100.96, 0.0]\n  Target bbox: [616.15, 320.88, 663.72, 398.58]\n\nFrame 4:\n  Drone pose: [-104.42, 63.02, 20.89, -45.3, 99.32, 0.0]\n  Target bbox: [615.61, 320.27, 664.26, 399.21]\n\nFrame 5 (current):\n  Drone pose: [-104.59, 63.49, 20.8, -45.27, 97.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.96, \"ymin\": 321.73, \"xmax\": 661.93, \"ymax\": 397.66}, \"waypoint_deltas\": [{\"dx\": -0.16, \"dy\": 0.46, \"dz\": -0.08, \"dpitch\": 0.06, \"dyaw\": -1.94, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": 0.93, \"dz\": -0.15, \"dpitch\": 0.13, \"dyaw\": -3.89, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.39, \"dz\": -0.21, \"dpitch\": 0.23, \"dyaw\": -5.83, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": 1.86, \"dz\": -0.27, \"dpitch\": 0.36, \"dyaw\": -7.78, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 2.33, \"dz\": -0.33, \"dpitch\": 0.52, \"dyaw\": -9.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.42, "window_alt_abs_m": 0.37, "target_px_mean_hist": 651.8, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.4, 65.82, 20.47, -44.75, 87.68, 0.0]\n  Target bbox: [614.05, 318.6, 666.02, 400.97]\n\nFrame 2:\n  Drone pose: [-105.57, 66.28, 20.42, -44.57, 85.76, 0.0]\n  Target bbox: [615.86, 320.02, 664.21, 399.49]\n\nFrame 3:\n  Drone pose: [-105.73, 66.75, 20.37, -44.36, 83.85, 0.0]\n  Target bbox: [615.72, 319.57, 664.4, 399.94]\n\nFrame 4:\n  Drone pose: [-105.89, 67.21, 20.33, -44.13, 81.96, 0.0]\n  Target bbox: [615.65, 319.86, 664.5, 399.71]\n\nFrame 5 (current):\n  Drone pose: [-106.06, 67.68, 20.29, -43.88, 80.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.28, \"ymin\": 319.29, \"xmax\": 665.86, \"ymax\": 400.33}, \"waypoint_deltas\": [{\"dx\": -0.16, \"dy\": 0.46, \"dz\": -0.04, \"dpitch\": 0.28, \"dyaw\": -1.84, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": 0.93, \"dz\": -0.07, \"dpitch\": 0.58, \"dyaw\": -3.64, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.39, \"dz\": -0.1, \"dpitch\": 0.9, \"dyaw\": -5.41, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": 1.86, \"dz\": -0.12, \"dpitch\": 1.24, \"dyaw\": -7.15, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 2.33, \"dz\": -0.14, \"dpitch\": 1.38, \"dyaw\": -7.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.58, "window_alt_abs_m": 0.18, "target_px_mean_hist": 686.8, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.04, 70.47, 20.13, -42.35, 72.15, 0.0]\n  Target bbox: [616.82, 322.88, 662.89, 396.59]\n\nFrame 2:\n  Drone pose: [-107.2, 70.94, 20.11, -42.21, 71.76, 0.0]\n  Target bbox: [615.68, 322.68, 664.02, 396.83]\n\nFrame 3:\n  Drone pose: [-107.37, 71.4, 20.09, -42.08, 71.37, 0.0]\n  Target bbox: [618.79, 322.96, 660.92, 396.52]\n\nFrame 4:\n  Drone pose: [-107.53, 71.87, 20.08, -41.94, 70.98, 0.0]\n  Target bbox: [617.47, 322.96, 662.24, 396.55]\n\nFrame 5 (current):\n  Drone pose: [-107.69, 72.33, 20.07, -41.81, 70.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.4, \"ymin\": 322.81, \"xmax\": 663.3, \"ymax\": 396.77}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": 0.47, \"dz\": -0.01, \"dpitch\": 0.13, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": -0.33, \"dy\": 0.93, \"dz\": -0.02, \"dpitch\": 0.26, \"dyaw\": -0.76, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.4, \"dz\": -0.03, \"dpitch\": 0.39, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": 1.87, \"dz\": -0.04, \"dpitch\": 0.52, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 2.33, \"dz\": -0.04, \"dpitch\": 0.65, \"dyaw\": -1.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.56, "window_alt_abs_m": 0.06, "target_px_mean_hist": 681.2, "cur_frame_id": 69, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.51, 74.66, 20.03, -41.16, 68.71, 0.0]\n  Target bbox: [617.6, 323.06, 662.14, 396.52]\n\nFrame 2:\n  Drone pose: [-108.67, 75.13, 20.02, -41.04, 68.35, 0.0]\n  Target bbox: [616.34, 322.89, 663.41, 396.75]\n\nFrame 3:\n  Drone pose: [-108.67, 75.58, 20.02, -40.97, 68.41, 0.0]\n  Target bbox: [616.49, 322.74, 663.26, 396.88]\n\nFrame 4:\n  Drone pose: [-108.62, 76.02, 20.01, -40.92, 68.58, 0.0]\n  Target bbox: [615.56, 323.06, 664.2, 396.59]\n\nFrame 5 (current):\n  Drone pose: [-108.41, 76.45, 20.01, -40.94, 69.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.14, \"ymin\": 323.68, \"xmax\": 658.59, \"ymax\": 395.85}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": 0.88, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": 0.59, \"dy\": 1.32, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": 1.76, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": 2.1, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 2.21, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": 2.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.18, "window_alt_abs_m": 0.01, "target_px_mean_hist": 655.8, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.47, 78.66, 20.0, -40.97, 71.73, 0.0]\n  Target bbox: [619.2, 323.55, 660.49, 396.03]\n\nFrame 2:\n  Drone pose: [-107.3, 79.11, 20.0, -40.98, 72.19, 0.0]\n  Target bbox: [617.43, 323.34, 662.25, 396.25]\n\nFrame 3:\n  Drone pose: [-107.14, 79.56, 20.0, -40.98, 72.63, 0.0]\n  Target bbox: [616.46, 323.71, 663.24, 395.9]\n\nFrame 4:\n  Drone pose: [-106.99, 80.02, 20.0, -40.98, 73.05, 0.0]\n  Target bbox: [619.31, 323.68, 660.36, 395.89]\n\nFrame 5 (current):\n  Drone pose: [-106.84, 80.47, 20.0, -40.98, 73.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.13, \"ymin\": 323.79, \"xmax\": 659.56, \"ymax\": 395.72}, \"waypoint_deltas\": [{\"dx\": 0.14, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.39, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 0.92, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.38, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": 0.54, \"dy\": 1.85, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 1.46, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 2.31, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 1.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 664.2, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-127.85, 38.79, 22.0, -44.5, 70.72, 0.0]\n  Target bbox: [651.12, 353.12, 689.51, 424.16]\n\nFrame 2:\n  Drone pose: [-129.61, 39.24, 21.2, -44.2, 68.65, 0.0]\n  Target bbox: [616.43, 323.12, 663.32, 396.36]\n\nFrame 3:\n  Drone pose: [-130.69, 40.08, 20.96, -39.44, 67.77, 0.0]\n  Target bbox: [588.75, 394.18, 635.72, 469.0]\n\nFrame 4:\n  Drone pose: [-131.36, 41.03, 20.8, -46.87, 68.34, 0.0]\n  Target bbox: [558.7, 271.05, 599.03, 343.43]\n\nFrame 5 (current):\n  Drone pose: [-131.53, 41.89, 20.93, -41.91, 66.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 557.64, \"ymin\": 356.13, \"xmax\": 600.74, \"ymax\": 431.88}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": -1.81, \"dyaw\": -6.93, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": -1.59, \"dyaw\": -8.57, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 2.06, \"dz\": -0.01, \"dpitch\": -1.72, \"dyaw\": -8.74, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 2.58, \"dz\": 0.0, \"dpitch\": -1.72, \"dyaw\": -8.85, \"droll\": 0.0}, {\"dx\": -0.27, \"dy\": 3.14, \"dz\": -0.06, \"dpitch\": -1.7, \"dyaw\": -8.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.64, "window_alt_abs_m": 1.33, "target_px_mean_hist": 656.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.75, 45.07, 20.91, -43.76, 57.32, 0.0]\n  Target bbox: [614.46, 321.19, 665.49, 398.42]\n\nFrame 2:\n  Drone pose: [-132.13, 45.23, 20.93, -43.11, 56.95, 0.0]\n  Target bbox: [616.62, 322.05, 663.29, 397.53]\n\nFrame 3:\n  Drone pose: [-132.27, 45.83, 20.96, -41.84, 51.55, 0.0]\n  Target bbox: [679.46, 347.11, 722.21, 420.39]\n\nFrame 4:\n  Drone pose: [-132.41, 46.37, 21.01, -44.06, 61.11, 0.0]\n  Target bbox: [554.1, 309.24, 602.34, 383.89]\n\nFrame 5 (current):\n  Drone pose: [-132.26, 46.85, 20.94, -41.56, 58.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 594.06, \"ymin\": 350.7, \"xmax\": 632.85, \"ymax\": 423.36}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.48, \"dz\": 0.03, \"dpitch\": -1.65, \"dyaw\": -2.02, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 0.96, \"dz\": 0.07, \"dpitch\": -1.35, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 1.43, \"dz\": 0.11, \"dpitch\": -1.44, \"dyaw\": -2.73, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 1.9, \"dz\": 0.16, \"dpitch\": -1.18, \"dyaw\": -3.57, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": 2.37, \"dz\": 0.23, \"dpitch\": -1.33, \"dyaw\": -3.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.83, "window_alt_abs_m": 0.16, "target_px_mean_hist": 646.0, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.88, 49.22, 21.17, -44.33, 56.47, 0.0]\n  Target bbox: [606.74, 299.78, 644.3, 371.69]\n\nFrame 2:\n  Drone pose: [-131.77, 49.71, 21.24, -40.73, 56.6, 0.0]\n  Target bbox: [590.69, 355.98, 636.0, 429.85]\n\nFrame 3:\n  Drone pose: [-131.71, 50.36, 21.3, -42.97, 54.36, 0.0]\n  Target bbox: [624.63, 326.65, 655.64, 392.66]\n\nFrame 4:\n  Drone pose: [-131.53, 50.6, 21.3, -39.93, 55.62, 0.0]\n  Target bbox: [600.94, 370.04, 640.34, 441.13]\n\nFrame 5 (current):\n  Drone pose: [-131.43, 51.19, 21.39, -42.95, 54.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.91, \"ymin\": 325.22, \"xmax\": 658.48, \"ymax\": 394.27}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": 0.55, \"dz\": 0.16, \"dpitch\": 0.08, \"dyaw\": -1.02, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": 1.04, \"dz\": 0.3, \"dpitch\": -0.18, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": 1.53, \"dz\": 0.45, \"dpitch\": -0.44, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 2.02, \"dz\": 0.61, \"dpitch\": -0.33, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": 2.52, \"dz\": 0.68, \"dpitch\": -0.5, \"dyaw\": -1.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.09, "window_alt_abs_m": 0.22, "target_px_mean_hist": 620.5, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-131.03, 53.71, 22.07, -45.32, 57.48, 0.0]\n  Target bbox: [563.11, 295.54, 604.3, 364.27]\n\nFrame 2:\n  Drone pose: [-130.85, 54.23, 22.13, -43.31, 52.2, 0.0]\n  Target bbox: [617.92, 324.04, 662.06, 395.6]\n\nFrame 3:\n  Drone pose: [-130.47, 54.85, 22.18, -43.79, 52.79, 0.0]\n  Target bbox: [622.18, 325.72, 658.21, 393.76]\n\nFrame 4:\n  Drone pose: [-129.83, 55.65, 22.33, -40.69, 56.77, 0.0]\n  Target bbox: [569.36, 388.76, 609.64, 458.82]\n\nFrame 5 (current):\n  Drone pose: [-127.65, 56.07, 22.61, -46.39, 57.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.26, \"ymin\": 324.8, \"xmax\": 661.12, \"ymax\": 394.62}, \"waypoint_deltas\": [{\"dx\": 3.65, \"dy\": 0.24, \"dz\": 0.8, \"dpitch\": -2.84, \"dyaw\": 8.34, \"droll\": 0.0}, {\"dx\": 8.39, \"dy\": 0.31, \"dz\": 0.87, \"dpitch\": -4.26, \"dyaw\": 21.03, \"droll\": 0.0}, {\"dx\": 12.69, \"dy\": 0.36, \"dz\": 1.15, \"dpitch\": -4.49, \"dyaw\": 32.9, \"droll\": 0.0}, {\"dx\": 16.46, \"dy\": 0.37, \"dz\": 0.35, \"dpitch\": -2.29, \"dyaw\": 42.66, \"droll\": 0.0}, {\"dx\": 18.72, \"dy\": 0.56, \"dz\": -0.41, \"dpitch\": -0.29, \"dyaw\": 47.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.64, "window_alt_abs_m": 0.54, "target_px_mean_hist": 576.5, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.35, 57.06, 22.04, -46.33, 105.16, 0.0]\n  Target bbox: [621.2, 324.68, 658.66, 394.58]\n\nFrame 2:\n  Drone pose: [-107.87, 57.74, 22.19, -48.28, 105.01, 0.0]\n  Target bbox: [618.24, 296.49, 667.31, 373.24]\n\nFrame 3:\n  Drone pose: [-107.42, 58.15, 21.93, -46.34, 105.04, 0.0]\n  Target bbox: [616.78, 321.45, 662.98, 398.01]\n\nFrame 4:\n  Drone pose: [-107.09, 58.66, 21.86, -46.32, 104.59, 0.0]\n  Target bbox: [619.38, 323.52, 660.45, 395.84]\n\nFrame 5 (current):\n  Drone pose: [-106.53, 59.25, 21.9, -50.08, 108.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 575.97, \"ymin\": 263.09, \"xmax\": 620.39, \"ymax\": 336.7}, \"waypoint_deltas\": [{\"dx\": 0.6, \"dy\": 0.38, \"dz\": -0.26, \"dpitch\": 4.18, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.88, \"dz\": -0.45, \"dpitch\": 4.43, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": 1.6, \"dy\": 1.38, \"dz\": -0.54, \"dpitch\": 4.56, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": 2.1, \"dy\": 1.88, \"dz\": -0.64, \"dpitch\": 4.7, \"dyaw\": -3.38, \"droll\": 0.0}, {\"dx\": 2.17, \"dy\": 2.35, \"dz\": -0.73, \"dpitch\": 4.71, \"dyaw\": -4.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.43, "window_alt_abs_m": 0.52, "target_px_mean_hist": 615.5, "cur_frame_id": 41, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.47, 61.67, 21.16, -43.94, 104.22, 0.0]\n  Target bbox: [607.94, 347.25, 655.04, 424.34]\n\nFrame 2:\n  Drone pose: [-104.23, 62.0, 20.98, -45.09, 102.7, 0.0]\n  Target bbox: [616.88, 321.38, 662.98, 398.04]\n\nFrame 3:\n  Drone pose: [-104.51, 62.55, 21.0, -45.41, 100.52, 0.0]\n  Target bbox: [616.5, 321.01, 663.38, 398.42]\n\nFrame 4:\n  Drone pose: [-104.39, 63.08, 20.98, -48.77, 99.48, 0.0]\n  Target bbox: [617.57, 266.86, 661.29, 342.86]\n\nFrame 5 (current):\n  Drone pose: [-104.69, 63.36, 20.8, -45.44, 102.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 555.36, \"ymin\": 316.86, \"xmax\": 604.96, \"ymax\": 394.86}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.59, \"dz\": -0.08, \"dpitch\": 0.23, \"dyaw\": -6.59, \"droll\": 0.0}, {\"dx\": -0.22, \"dy\": 1.06, \"dz\": -0.15, \"dpitch\": 0.3, \"dyaw\": -8.54, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": 1.52, \"dz\": -0.21, \"dpitch\": 0.4, \"dyaw\": -10.48, \"droll\": 0.0}, {\"dx\": -0.55, \"dy\": 1.99, \"dz\": -0.27, \"dpitch\": 0.53, \"dyaw\": -12.43, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": 2.46, \"dz\": -0.33, \"dpitch\": 0.69, \"dyaw\": -14.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.3, "window_alt_abs_m": 0.4, "target_px_mean_hist": 650.0, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.4, 65.82, 20.47, -40.06, 89.93, 0.0]\n  Target bbox: [587.79, 398.82, 637.3, 479.37]\n\nFrame 2:\n  Drone pose: [-105.57, 66.28, 20.42, -44.57, 85.76, 0.0]\n  Target bbox: [616.57, 320.19, 663.51, 399.27]\n\nFrame 3:\n  Drone pose: [-105.75, 66.93, 20.35, -44.6, 83.73, 0.0]\n  Target bbox: [615.24, 319.53, 664.88, 400.01]\n\nFrame 4:\n  Drone pose: [-106.0, 67.11, 20.4, -44.07, 81.69, 0.0]\n  Target bbox: [615.42, 319.9, 664.73, 399.7]\n\nFrame 5 (current):\n  Drone pose: [-106.06, 67.68, 20.29, -43.88, 80.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.79, \"ymin\": 319.01, \"xmax\": 665.4, \"ymax\": 400.6}, \"waypoint_deltas\": [{\"dx\": -0.16, \"dy\": 0.46, \"dz\": -0.04, \"dpitch\": 0.28, \"dyaw\": -1.84, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": 0.93, \"dz\": -0.07, \"dpitch\": 0.58, \"dyaw\": -3.64, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.39, \"dz\": -0.1, \"dpitch\": 0.9, \"dyaw\": -5.41, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": 1.86, \"dz\": -0.12, \"dpitch\": 1.24, \"dyaw\": -7.15, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 2.33, \"dz\": -0.14, \"dpitch\": 1.38, \"dyaw\": -7.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.83, "window_alt_abs_m": 0.28, "target_px_mean_hist": 679.8, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.91, 70.5, 20.12, -42.44, 72.47, 0.0]\n  Target bbox: [615.38, 322.97, 664.32, 396.55]\n\nFrame 2:\n  Drone pose: [-107.2, 70.94, 20.11, -43.54, 75.49, 0.0]\n  Target bbox: [571.04, 302.36, 614.93, 374.74]\n\nFrame 3:\n  Drone pose: [-107.37, 71.4, 20.09, -44.04, 76.32, 0.0]\n  Target bbox: [554.13, 292.1, 600.93, 365.02]\n\nFrame 4:\n  Drone pose: [-107.38, 72.0, 20.09, -42.2, 71.24, 0.0]\n  Target bbox: [617.16, 322.88, 662.55, 396.62]\n\nFrame 5 (current):\n  Drone pose: [-107.69, 72.33, 20.07, -38.8, 75.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 552.8, \"ymin\": 375.45, \"xmax\": 600.65, \"ymax\": 448.64}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": 0.47, \"dz\": -0.01, \"dpitch\": -2.88, \"dyaw\": -5.38, \"droll\": 0.0}, {\"dx\": -0.33, \"dy\": 0.93, \"dz\": -0.02, \"dpitch\": -2.75, \"dyaw\": -5.76, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 1.4, \"dz\": -0.03, \"dpitch\": -2.62, \"dyaw\": -6.14, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": 1.87, \"dz\": -0.04, \"dpitch\": -2.49, \"dyaw\": -6.51, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 2.33, \"dz\": -0.04, \"dpitch\": -2.36, \"dyaw\": -6.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.29, "window_alt_abs_m": 0.06, "target_px_mean_hist": 696.0, "cur_frame_id": 69, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.45, 74.65, 19.99, -38.59, 69.19, 0.0]\n  Target bbox: [613.73, 365.22, 657.7, 439.05]\n\nFrame 2:\n  Drone pose: [-108.67, 75.13, 20.02, -41.04, 68.35, 0.0]\n  Target bbox: [618.71, 323.14, 661.03, 396.42]\n\nFrame 3:\n  Drone pose: [-108.67, 75.58, 20.02, -45.03, 66.81, 0.0]\n  Target bbox: [636.25, 254.69, 684.97, 328.77]\n\nFrame 4:\n  Drone pose: [-108.62, 76.02, 20.01, -40.52, 68.09, 0.0]\n  Target bbox: [622.29, 329.52, 670.04, 403.7]\n\nFrame 5 (current):\n  Drone pose: [-108.32, 76.35, 20.05, -45.02, 73.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 569.28, \"ymin\": 255.34, \"xmax\": 615.99, \"ymax\": 327.72}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": 0.54, \"dz\": -0.04, \"dpitch\": 4.07, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 0.98, \"dz\": -0.04, \"dpitch\": 4.06, \"dyaw\": -2.89, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 1.42, \"dz\": -0.04, \"dpitch\": 4.06, \"dyaw\": -2.38, \"droll\": 0.0}, {\"dx\": 0.68, \"dy\": 1.86, \"dz\": -0.05, \"dpitch\": 4.05, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 2.31, \"dz\": -0.05, \"dpitch\": 4.05, \"dyaw\": -1.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.71, "window_alt_abs_m": 0.07, "target_px_mean_hist": 656.8, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.47, 78.66, 20.0, -40.97, 71.73, 0.0]\n  Target bbox: [617.32, 322.93, 662.36, 396.67]\n\nFrame 2:\n  Drone pose: [-107.3, 79.11, 20.0, -42.99, 67.19, 0.0]\n  Target bbox: [679.53, 290.68, 728.02, 364.95]\n\nFrame 3:\n  Drone pose: [-107.24, 79.51, 20.13, -41.08, 72.43, 0.0]\n  Target bbox: [618.06, 323.57, 661.63, 396.02]\n\nFrame 4:\n  Drone pose: [-106.96, 79.91, 20.16, -41.09, 73.19, 0.0]\n  Target bbox: [620.51, 324.11, 659.17, 395.45]\n\nFrame 5 (current):\n  Drone pose: [-106.78, 80.42, 19.95, -40.87, 73.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.93, \"ymin\": 324.1, \"xmax\": 658.75, \"ymax\": 395.45}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": 0.51, \"dz\": 0.05, \"dpitch\": -0.11, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 0.97, \"dz\": 0.05, \"dpitch\": -0.11, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.43, \"dz\": 0.05, \"dpitch\": -0.11, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": 0.48, \"dy\": 1.9, \"dz\": 0.05, \"dpitch\": -0.1, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": 2.36, \"dz\": 0.05, \"dpitch\": -0.1, \"dyaw\": 1.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.99, "window_alt_abs_m": 0.36, "target_px_mean_hist": 666.0, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776024350", "difficulty_score": 0.3864, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [119.17, -76.52, 22.0, -46.27, 171.47, 0.0]\n  Target bbox: [622.65, 328.09, 657.07, 391.21]\n\nFrame 2:\n  Drone pose: [117.6, -77.61, 21.2, -46.4, 167.77, 0.0]\n  Target bbox: [620.94, 324.85, 658.72, 394.46]\n\nFrame 3:\n  Drone pose: [116.65, -78.07, 20.67, -46.15, 166.11, 0.0]\n  Target bbox: [618.41, 320.72, 661.14, 398.59]\n\nFrame 4:\n  Drone pose: [116.08, -78.08, 20.64, -46.21, 166.03, 0.0]\n  Target bbox: [619.41, 321.09, 660.16, 398.22]\n\nFrame 5 (current):\n  Drone pose: [115.49, -78.09, 20.62, -46.29, 165.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.4, \"ymin\": 323.68, \"xmax\": 660.27, \"ymax\": 395.56}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": 0.02, \"dz\": -0.03, \"dpitch\": -0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.13, \"dz\": -0.05, \"dpitch\": -0.09, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.25, \"dz\": -0.07, \"dpitch\": 0.06, \"dyaw\": -0.81, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": 0.37, \"dz\": -0.09, \"dpitch\": 0.22, \"dyaw\": -1.97, \"droll\": 0.0}, {\"dx\": -2.64, \"dy\": 0.47, \"dz\": -0.2, \"dpitch\": 0.31, \"dyaw\": -1.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.54, "window_alt_abs_m": 1.38, "target_px_mean_hist": 633.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [112.34, -77.54, 20.39, -46.0, 164.49, 0.0]\n  Target bbox: [619.65, 324.26, 660.08, 394.94]\n\nFrame 2:\n  Drone pose: [111.82, -77.46, 20.36, -46.01, 164.7, 0.0]\n  Target bbox: [618.61, 321.45, 661.0, 397.83]\n\nFrame 3:\n  Drone pose: [111.3, -77.39, 20.33, -46.01, 164.91, 0.0]\n  Target bbox: [618.1, 322.72, 661.57, 396.51]\n\nFrame 4:\n  Drone pose: [110.79, -77.31, 20.3, -46.02, 165.14, 0.0]\n  Target bbox: [620.14, 325.96, 659.64, 393.21]\n\nFrame 5 (current):\n  Drone pose: [110.28, -77.21, 20.27, -46.04, 165.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.47, \"ymin\": 324.63, \"xmax\": 660.27, \"ymax\": 394.56}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.11, \"dz\": -0.03, \"dpitch\": -0.02, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.24, \"dz\": -0.05, \"dpitch\": 0.14, \"dyaw\": -0.81, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": 0.38, \"dz\": -0.08, \"dpitch\": 0.11, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 0.52, \"dz\": -0.1, \"dpitch\": 0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": 0.68, \"dz\": -0.12, \"dpitch\": 0.01, \"dyaw\": 0.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.93, "window_alt_abs_m": 0.12, "target_px_mean_hist": 677.8, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.71, -76.21, 20.12, -45.91, 165.34, 0.0]\n  Target bbox: [618.92, 320.44, 660.64, 398.86]\n\nFrame 2:\n  Drone pose: [106.21, -76.07, 20.1, -45.93, 165.79, 0.0]\n  Target bbox: [617.77, 318.88, 661.7, 400.53]\n\nFrame 3:\n  Drone pose: [105.73, -75.94, 20.09, -45.94, 166.21, 0.0]\n  Target bbox: [619.16, 324.95, 660.6, 394.17]\n\nFrame 4:\n  Drone pose: [105.25, -75.82, 20.08, -45.94, 166.57, 0.0]\n  Target bbox: [620.13, 326.43, 659.67, 392.68]\n\nFrame 5 (current):\n  Drone pose: [104.76, -75.71, 20.07, -45.94, 166.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.38, \"ymin\": 319.75, \"xmax\": 661.1, \"ymax\": 399.68}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.09, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.18, \"dz\": -0.02, \"dpitch\": -0.01, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.26, \"dz\": -0.03, \"dpitch\": -0.02, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 0.34, \"dz\": -0.03, \"dpitch\": -0.04, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": 0.42, \"dz\": -0.04, \"dpitch\": -0.06, \"dyaw\": 1.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.56, "window_alt_abs_m": 0.05, "target_px_mean_hist": 712.0, "cur_frame_id": 25, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.78, -75.21, 20.03, -46.01, 168.44, 0.0]\n  Target bbox: [618.97, 319.49, 660.51, 399.88]\n\nFrame 2:\n  Drone pose: [101.28, -75.13, 20.02, -46.03, 168.7, 0.0]\n  Target bbox: [619.15, 320.35, 660.39, 398.89]\n\nFrame 3:\n  Drone pose: [100.77, -75.05, 20.02, -46.06, 168.95, 0.0]\n  Target bbox: [618.35, 322.14, 661.24, 397.04]\n\nFrame 4:\n  Drone pose: [100.27, -74.96, 20.02, -46.09, 169.21, 0.0]\n  Target bbox: [619.47, 325.64, 660.27, 393.43]\n\nFrame 5 (current):\n  Drone pose: [99.75, -74.87, 20.01, -46.14, 169.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.46, \"ymin\": 321.06, \"xmax\": 660.09, \"ymax\": 398.14}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.2, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.3, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": -2.11, \"dy\": 0.38, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 1.13, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": 0.44, \"dz\": -0.01, \"dpitch\": -0.38, \"dyaw\": 1.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.03, "window_alt_abs_m": 0.01, "target_px_mean_hist": 720.2, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.92, -74.38, 20.0, -46.77, 170.83, 0.0]\n  Target bbox: [620.39, 326.74, 659.37, 392.29]\n\nFrame 2:\n  Drone pose: [95.35, -74.38, 20.0, -46.88, 170.81, 0.0]\n  Target bbox: [620.1, 323.88, 659.54, 395.24]\n\nFrame 3:\n  Drone pose: [94.8, -74.38, 20.0, -46.96, 170.8, 0.0]\n  Target bbox: [619.09, 319.47, 660.36, 399.83]\n\nFrame 4:\n  Drone pose: [94.27, -74.37, 20.0, -47.01, 170.82, 0.0]\n  Target bbox: [620.24, 326.72, 659.52, 392.29]\n\nFrame 5 (current):\n  Drone pose: [93.76, -74.35, 20.0, -47.04, 170.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.97, \"ymin\": 322.61, \"xmax\": 659.64, \"ymax\": 396.49}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 729.8, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [90.34, -74.39, 20.0, -46.92, 170.78, 0.0]\n  Target bbox: [618.98, 318.81, 660.46, 400.43]\n\nFrame 2:\n  Drone pose: [89.85, -74.42, 20.0, -47.1, 170.69, 0.0]\n  Target bbox: [620.36, 326.91, 659.41, 392.11]\n\nFrame 3:\n  Drone pose: [89.35, -74.45, 20.0, -47.08, 170.58, 0.0]\n  Target bbox: [620.06, 323.39, 659.58, 395.7]\n\nFrame 4:\n  Drone pose: [88.87, -74.51, 20.0, -47.03, 170.4, 0.0]\n  Target bbox: [618.21, 319.96, 661.29, 399.19]\n\nFrame 5 (current):\n  Drone pose: [88.39, -74.61, 20.0, -46.99, 170.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.97, \"ymin\": 319.44, \"xmax\": 660.48, \"ymax\": 399.88}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": 0.6, \"dz\": 0.0, \"dpitch\": -0.41, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": -2.74, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": -0.72, \"dyaw\": 5.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 721.8, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.04, -71.56, 20.0, -48.0, 179.98, 0.0]\n  Target bbox: [621.16, 328.7, 658.83, 390.17]\n\nFrame 2:\n  Drone pose: [84.48, -69.96, 20.0, -47.97, -174.66, 0.0]\n  Target bbox: [621.33, 321.44, 659.16, 397.7]\n\nFrame 3:\n  Drone pose: [83.94, -68.42, 20.0, -47.68, -169.57, 0.0]\n  Target bbox: [618.86, 322.78, 661.1, 396.37]\n\nFrame 4:\n  Drone pose: [83.41, -67.19, 20.0, -47.49, -167.19, 0.0]\n  Target bbox: [614.43, 318.86, 665.45, 400.36]\n\nFrame 5 (current):\n  Drone pose: [82.86, -66.33, 20.0, -47.43, -166.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.87, \"ymin\": 322.88, \"xmax\": 660.02, \"ymax\": 396.23}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 0.79, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 0.93, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 1.48, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": 0.98, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 1.69, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.99, "window_alt_abs_m": 0.0, "target_px_mean_hist": 729.2, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [79.2, -65.3, 20.0, -47.47, -164.19, 0.0]\n  Target bbox: [619.2, 323.8, 661.07, 395.32]\n\nFrame 2:\n  Drone pose: [78.71, -65.28, 20.0, -47.44, -164.16, 0.0]\n  Target bbox: [618.6, 322.14, 661.73, 396.99]\n\nFrame 3:\n  Drone pose: [78.22, -65.27, 20.0, -47.41, -164.11, 0.0]\n  Target bbox: [618.39, 320.99, 661.99, 398.16]\n\nFrame 4:\n  Drone pose: [77.74, -65.24, 20.0, -47.38, -164.05, 0.0]\n  Target bbox: [618.34, 318.98, 662.11, 400.2]\n\nFrame 5 (current):\n  Drone pose: [77.25, -65.21, 20.0, -47.34, -163.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.45, \"ymin\": 317.87, \"xmax\": 663.07, \"ymax\": 401.42}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 723.0, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.2, -65.0, 20.0, -47.33, -163.27, 0.0]\n  Target bbox: [617.59, 319.14, 662.9, 400.2]\n\nFrame 2:\n  Drone pose: [73.67, -64.98, 20.0, -47.37, -163.17, 0.0]\n  Target bbox: [618.93, 323.63, 661.33, 395.48]\n\nFrame 3:\n  Drone pose: [73.14, -64.97, 20.0, -47.4, -163.1, 0.0]\n  Target bbox: [618.3, 319.78, 662.11, 399.43]\n\nFrame 4:\n  Drone pose: [72.63, -64.96, 20.0, -47.42, -163.06, 0.0]\n  Target bbox: [618.65, 322.41, 661.64, 396.68]\n\nFrame 5 (current):\n  Drone pose: [72.12, -64.95, 20.0, -47.44, -163.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.21, \"ymin\": 317.75, \"xmax\": 663.3, \"ymax\": 401.55}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.23, "window_alt_abs_m": 0.0, "target_px_mean_hist": 717.8, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [68.61, -64.95, 20.0, -47.45, -163.01, 0.0]\n  Target bbox: [617.82, 318.86, 662.62, 400.33]\n\nFrame 2:\n  Drone pose: [68.1, -64.94, 20.0, -47.45, -162.99, 0.0]\n  Target bbox: [617.19, 318.18, 663.33, 401.17]\n\nFrame 3:\n  Drone pose: [67.61, -64.95, 20.0, -47.44, -163.02, 0.0]\n  Target bbox: [618.98, 323.87, 661.26, 395.22]\n\nFrame 4:\n  Drone pose: [67.11, -64.93, 20.0, -47.43, -162.96, 0.0]\n  Target bbox: [618.25, 319.62, 662.17, 399.58]\n\nFrame 5 (current):\n  Drone pose: [66.64, -64.94, 20.0, -47.39, -163.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.05, \"ymin\": 318.84, \"xmax\": 662.39, \"ymax\": 400.34}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -1.93, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": 0.38, \"droll\": 0.0}, {\"dx\": -2.42, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 0.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.14, "window_alt_abs_m": 0.0, "target_px_mean_hist": 717.0, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [119.09, -76.4, 21.99, -44.36, 180.05, 0.0]\n  Target bbox: [611.78, 384.04, 650.45, 452.76]\n\nFrame 2:\n  Drone pose: [117.47, -77.65, 21.08, -51.32, 168.94, 0.0]\n  Target bbox: [650.76, 240.33, 694.75, 311.91]\n\nFrame 3:\n  Drone pose: [116.77, -78.21, 20.68, -54.49, 160.18, 0.0]\n  Target bbox: [669.15, 267.68, 716.46, 350.41]\n\nFrame 4:\n  Drone pose: [116.08, -78.08, 20.64, -43.84, 161.65, 0.0]\n  Target bbox: [670.16, 364.7, 712.2, 436.91]\n\nFrame 5 (current):\n  Drone pose: [115.49, -78.09, 20.62, -48.63, 166.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 609.0, \"ymin\": 279.41, \"xmax\": 660.57, \"ymax\": 361.28}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": 0.02, \"dz\": -0.03, \"dpitch\": 2.28, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.13, \"dz\": -0.05, \"dpitch\": 2.25, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.25, \"dz\": -0.07, \"dpitch\": 2.4, \"dyaw\": -1.25, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": 0.37, \"dz\": -0.09, \"dpitch\": 2.56, \"dyaw\": -2.41, \"droll\": 0.0}, {\"dx\": -2.64, \"dy\": 0.47, \"dz\": -0.2, \"dpitch\": 2.65, \"dyaw\": -2.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 26.06, "window_alt_abs_m": 1.37, "target_px_mean_hist": 648.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [112.34, -77.54, 20.39, -48.26, 160.94, 0.0]\n  Target bbox: [660.14, 286.13, 703.38, 359.09]\n\nFrame 2:\n  Drone pose: [111.82, -77.46, 20.36, -46.01, 164.7, 0.0]\n  Target bbox: [619.26, 322.12, 660.37, 397.16]\n\nFrame 3:\n  Drone pose: [111.3, -77.39, 20.33, -46.01, 164.91, 0.0]\n  Target bbox: [619.73, 324.72, 660.0, 394.51]\n\nFrame 4:\n  Drone pose: [110.79, -77.31, 20.3, -42.46, 165.65, 0.0]\n  Target bbox: [613.09, 382.12, 654.19, 456.78]\n\nFrame 5 (current):\n  Drone pose: [110.28, -77.21, 20.27, -48.92, 162.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 648.56, \"ymin\": 278.14, \"xmax\": 688.88, \"ymax\": 345.17}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.11, \"dz\": -0.03, \"dpitch\": 2.86, \"dyaw\": 2.76, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.24, \"dz\": -0.05, \"dpitch\": 3.02, \"dyaw\": 1.62, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": 0.38, \"dz\": -0.08, \"dpitch\": 2.99, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 0.52, \"dz\": -0.1, \"dpitch\": 2.94, \"dyaw\": 2.44, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": 0.68, \"dz\": -0.12, \"dpitch\": 2.89, \"dyaw\": 2.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.38, "window_alt_abs_m": 0.12, "target_px_mean_hist": 682.0, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.87, -76.2, 20.2, -49.07, 170.68, 0.0]\n  Target bbox: [616.13, 318.54, 663.44, 400.54]\n\nFrame 2:\n  Drone pose: [106.21, -76.16, 20.18, -41.51, 177.41, 0.0]\n  Target bbox: [572.29, 377.96, 614.64, 453.37]\n\nFrame 3:\n  Drone pose: [105.79, -75.96, 20.16, -46.8, 166.52, 0.0]\n  Target bbox: [645.02, 364.26, 681.59, 439.21]\n\nFrame 4:\n  Drone pose: [105.25, -75.82, 20.08, -45.94, 166.57, 0.0]\n  Target bbox: [617.91, 319.04, 661.58, 400.27]\n\nFrame 5 (current):\n  Drone pose: [104.76, -75.71, 20.07, -45.94, 166.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.23, \"ymin\": 323.6, \"xmax\": 660.44, \"ymax\": 395.6}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.09, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.18, \"dz\": -0.02, \"dpitch\": -0.01, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.26, \"dz\": -0.03, \"dpitch\": -0.02, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 0.34, \"dz\": -0.03, \"dpitch\": -0.04, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": 0.42, \"dz\": -0.04, \"dpitch\": -0.06, \"dyaw\": 1.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.0, "window_alt_abs_m": 0.13, "target_px_mean_hist": 697.5, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.78, -75.21, 20.03, -45.68, 173.44, 0.0]\n  Target bbox: [560.9, 335.04, 601.42, 398.99]\n\nFrame 2:\n  Drone pose: [101.28, -75.13, 20.02, -50.61, 170.51, 0.0]\n  Target bbox: [597.25, 247.76, 639.04, 317.85]\n\nFrame 3:\n  Drone pose: [100.71, -74.96, 19.95, -41.59, 168.08, 0.0]\n  Target bbox: [561.94, 394.74, 608.31, 469.2]\n\nFrame 4:\n  Drone pose: [100.27, -74.96, 20.02, -41.09, 167.95, 0.0]\n  Target bbox: [634.47, 405.87, 675.17, 481.89]\n\nFrame 5 (current):\n  Drone pose: [99.75, -74.87, 20.01, -47.48, 174.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.47, \"ymin\": 300.6, \"xmax\": 601.75, \"ymax\": 377.09}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 1.29, \"dyaw\": -4.71, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 0.2, \"dz\": 0.0, \"dpitch\": 1.23, \"dyaw\": -4.41, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.3, \"dz\": 0.0, \"dpitch\": 1.16, \"dyaw\": -4.11, \"droll\": 0.0}, {\"dx\": -2.11, \"dy\": 0.38, \"dz\": 0.0, \"dpitch\": 1.07, \"dyaw\": -3.87, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": 0.44, \"dz\": -0.01, \"dpitch\": 0.96, \"dyaw\": -3.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.02, "window_alt_abs_m": 0.14, "target_px_mean_hist": 706.2, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.98, -74.47, 19.89, -42.75, 170.1, 0.0]\n  Target bbox: [637.61, 307.21, 671.0, 368.74]\n\nFrame 2:\n  Drone pose: [95.35, -74.38, 20.0, -48.72, 169.67, 0.0]\n  Target bbox: [633.18, 294.53, 673.37, 362.9]\n\nFrame 3:\n  Drone pose: [94.8, -74.38, 20.0, -46.96, 170.8, 0.0]\n  Target bbox: [620.06, 321.93, 659.5, 397.26]\n\nFrame 4:\n  Drone pose: [94.27, -74.37, 20.0, -45.94, 175.82, 0.0]\n  Target bbox: [561.68, 345.93, 602.62, 412.81]\n\nFrame 5 (current):\n  Drone pose: [93.76, -74.35, 20.0, -46.04, 167.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 651.58, \"ymin\": 339.97, \"xmax\": 695.13, \"ymax\": 413.65}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -1.0, \"dyaw\": 2.93, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": -0.99, \"dyaw\": 2.96, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -0.97, \"dyaw\": 2.99, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -0.95, \"dyaw\": 2.99, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.92, \"dyaw\": 2.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.44, "window_alt_abs_m": 0.11, "target_px_mean_hist": 706.2, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [90.34, -74.39, 20.0, -42.3, 169.78, 0.0]\n  Target bbox: [631.5, 401.65, 671.7, 472.82]\n\nFrame 2:\n  Drone pose: [89.85, -74.42, 20.0, -42.73, 174.16, 0.0]\n  Target bbox: [579.75, 397.86, 619.5, 469.72]\n\nFrame 3:\n  Drone pose: [89.35, -74.45, 20.0, -50.6, 175.08, 0.0]\n  Target bbox: [567.26, 265.0, 608.31, 338.82]\n\nFrame 4:\n  Drone pose: [88.87, -74.51, 20.0, -43.82, 165.4, 0.0]\n  Target bbox: [676.67, 379.19, 718.46, 451.51]\n\nFrame 5 (current):\n  Drone pose: [88.39, -74.61, 20.0, -46.99, 170.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.5, \"ymin\": 321.81, \"xmax\": 660.08, \"ymax\": 397.33}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": 0.6, \"dz\": 0.0, \"dpitch\": -0.41, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": -2.74, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": -0.72, \"dyaw\": 5.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.69, "window_alt_abs_m": 0.0, "target_px_mean_hist": 731.5, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.04, -71.56, 20.0, -53.0, 177.18, 0.0]\n  Target bbox: [652.94, 238.37, 690.77, 313.54]\n\nFrame 2:\n  Drone pose: [84.48, -69.96, 20.0, -51.2, -174.93, 0.0]\n  Target bbox: [620.29, 267.59, 665.72, 342.95]\n\nFrame 3:\n  Drone pose: [84.1, -68.43, 20.08, -47.11, -162.58, 0.0]\n  Target bbox: [557.66, 282.34, 605.11, 359.66]\n\nFrame 4:\n  Drone pose: [83.41, -67.19, 20.0, -45.55, -162.61, 0.0]\n  Target bbox: [565.39, 356.16, 609.67, 430.92]\n\nFrame 5 (current):\n  Drone pose: [82.91, -66.41, 19.92, -50.46, -161.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.12, \"ymin\": 247.6, \"xmax\": 607.3, \"ymax\": 323.57}, \"waypoint_deltas\": [{\"dx\": -0.6, \"dy\": 0.59, \"dz\": 0.08, \"dpitch\": 2.95, \"dyaw\": -4.18, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": 0.87, \"dz\": 0.08, \"dpitch\": 2.98, \"dyaw\": -3.25, \"droll\": 0.0}, {\"dx\": -1.69, \"dy\": 1.01, \"dz\": 0.08, \"dpitch\": 2.98, \"dyaw\": -2.79, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": 1.06, \"dz\": 0.08, \"dpitch\": 2.97, \"dyaw\": -2.58, \"droll\": 0.0}, {\"dx\": -2.73, \"dy\": 1.09, \"dz\": 0.08, \"dpitch\": 2.96, \"dyaw\": -2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.12, "window_alt_abs_m": 0.24, "target_px_mean_hist": 703.8, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [79.2, -65.3, 20.0, -51.7, -159.19, 0.0]\n  Target bbox: [559.93, 252.5, 605.44, 328.24]\n\nFrame 2:\n  Drone pose: [78.71, -65.28, 20.0, -45.96, -169.16, 0.0]\n  Target bbox: [675.52, 346.53, 719.54, 425.9]\n\nFrame 3:\n  Drone pose: [78.22, -65.27, 20.0, -47.41, -164.11, 0.0]\n  Target bbox: [619.39, 323.51, 660.89, 395.61]\n\nFrame 4:\n  Drone pose: [77.74, -65.24, 20.0, -45.38, -159.16, 0.0]\n  Target bbox: [562.12, 359.72, 605.68, 429.68]\n\nFrame 5 (current):\n  Drone pose: [77.25, -65.21, 20.0, -43.84, -167.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 659.14, \"ymin\": 378.0, \"xmax\": 705.76, \"ymax\": 460.96}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -3.48, \"dyaw\": 3.75, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -3.45, \"dyaw\": 3.86, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -3.44, \"dyaw\": 3.98, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": -3.45, \"dyaw\": 4.11, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": -3.46, \"dyaw\": 4.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 28.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 725.5, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.22, -65.02, 19.92, -50.21, -153.54, 0.0]\n  Target bbox: [582.29, 255.45, 620.7, 324.86]\n\nFrame 2:\n  Drone pose: [73.67, -64.98, 20.0, -42.51, -160.91, 0.0]\n  Target bbox: [591.07, 401.24, 636.74, 482.35]\n\nFrame 3:\n  Drone pose: [73.14, -64.97, 20.0, -47.4, -163.1, 0.0]\n  Target bbox: [618.9, 323.87, 661.34, 395.22]\n\nFrame 4:\n  Drone pose: [72.63, -64.96, 20.0, -45.31, -162.7, 0.0]\n  Target bbox: [614.69, 358.15, 657.24, 431.75]\n\nFrame 5 (current):\n  Drone pose: [72.12, -64.95, 20.0, -47.93, -168.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 676.05, \"ymin\": 319.48, \"xmax\": 718.66, \"ymax\": 386.48}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.49, \"dyaw\": 5.01, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": 5.02, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": 5.02, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": 5.02, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": 5.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.29, "window_alt_abs_m": 0.08, "target_px_mean_hist": 672.8, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [68.61, -64.95, 20.0, -45.09, -158.01, 0.0]\n  Target bbox: [557.85, 358.35, 607.9, 443.97]\n\nFrame 2:\n  Drone pose: [68.1, -64.94, 20.0, -47.45, -162.99, 0.0]\n  Target bbox: [618.52, 321.72, 661.83, 397.47]\n\nFrame 3:\n  Drone pose: [67.61, -64.95, 20.0, -47.44, -163.02, 0.0]\n  Target bbox: [619.16, 325.46, 660.99, 393.55]\n\nFrame 4:\n  Drone pose: [67.11, -64.93, 20.0, -47.43, -162.96, 0.0]\n  Target bbox: [616.94, 318.21, 663.53, 400.99]\n\nFrame 5 (current):\n  Drone pose: [66.64, -64.94, 20.0, -49.02, -158.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.48, \"ymin\": 298.92, \"xmax\": 604.93, \"ymax\": 369.17}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 1.65, \"dyaw\": -4.92, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 1.69, \"dyaw\": -4.93, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 1.73, \"dyaw\": -4.72, \"droll\": 0.0}, {\"dx\": -1.93, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 1.81, \"dyaw\": -4.62, \"droll\": 0.0}, {\"dx\": -2.42, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": 1.82, \"dyaw\": -4.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 728.0, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776206642", "difficulty_score": 0.2266, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.11, -66.56, 22.0, -46.26, 90.0, 0.0]\n  Target bbox: [625.1, 327.73, 654.9, 391.47]\n\nFrame 2:\n  Drone pose: [-61.39, -66.77, 21.2, -44.09, 86.44, 0.0]\n  Target bbox: [623.97, 327.17, 655.69, 392.17]\n\nFrame 3:\n  Drone pose: [-61.94, -66.61, 20.67, -42.81, 85.02, 0.0]\n  Target bbox: [623.67, 326.97, 655.99, 392.45]\n\nFrame 4:\n  Drone pose: [-62.11, -66.24, 20.64, -42.56, 84.61, 0.0]\n  Target bbox: [623.52, 326.83, 656.14, 392.58]\n\nFrame 5 (current):\n  Drone pose: [-62.12, -65.81, 20.62, -42.43, 84.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.77, \"ymin\": 326.26, \"xmax\": 655.9, \"ymax\": 393.12}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.45, \"dz\": -0.03, \"dpitch\": 0.09, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 0.91, \"dz\": -0.05, \"dpitch\": 0.18, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 1.36, \"dz\": -0.07, \"dpitch\": 0.27, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.82, \"dz\": -0.09, \"dpitch\": 0.35, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 2.29, \"dz\": -0.2, \"dpitch\": 0.55, \"dyaw\": 0.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.41, "window_alt_abs_m": 1.38, "target_px_mean_hist": 633.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.95, -63.03, 20.39, -41.82, 85.11, 0.0]\n  Target bbox: [623.77, 327.03, 655.89, 392.43]\n\nFrame 2:\n  Drone pose: [-61.94, -62.53, 20.36, -41.78, 85.13, 0.0]\n  Target bbox: [623.65, 326.4, 656.01, 393.0]\n\nFrame 3:\n  Drone pose: [-61.94, -62.02, 20.33, -41.75, 85.13, 0.0]\n  Target bbox: [620.54, 326.66, 659.06, 392.79]\n\nFrame 4:\n  Drone pose: [-61.94, -61.51, 20.3, -41.72, 85.13, 0.0]\n  Target bbox: [621.11, 326.28, 658.49, 393.15]\n\nFrame 5 (current):\n  Drone pose: [-61.94, -61.0, 20.27, -41.69, 85.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.67, \"ymin\": 326.35, \"xmax\": 655.99, \"ymax\": 393.06}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.52, \"dz\": -0.08, \"dpitch\": 0.09, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.01, \"dz\": -0.1, \"dpitch\": 0.13, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": 2.49, \"dz\": -0.12, \"dpitch\": 0.18, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.03, "window_alt_abs_m": 0.12, "target_px_mean_hist": 641.0, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.89, -57.59, 20.12, -41.35, 85.28, 0.0]\n  Target bbox: [623.09, 326.1, 656.55, 393.32]\n\nFrame 2:\n  Drone pose: [-61.87, -57.14, 20.1, -41.27, 85.34, 0.0]\n  Target bbox: [623.17, 326.87, 656.48, 392.58]\n\nFrame 3:\n  Drone pose: [-61.85, -56.69, 20.09, -41.19, 85.39, 0.0]\n  Target bbox: [621.1, 326.76, 658.5, 392.72]\n\nFrame 4:\n  Drone pose: [-61.85, -56.22, 20.08, -41.13, 85.41, 0.0]\n  Target bbox: [620.41, 325.94, 659.18, 393.49]\n\nFrame 5 (current):\n  Drone pose: [-61.82, -55.76, 20.07, -41.06, 85.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.53, \"ymin\": 326.6, \"xmax\": 659.06, \"ymax\": 392.88}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.41, \"dz\": -0.01, \"dpitch\": 0.12, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 0.76, \"dz\": -0.02, \"dpitch\": 0.32, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": 1.05, \"dz\": -0.03, \"dpitch\": 0.58, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": 1.35, \"dz\": -0.03, \"dpitch\": 0.82, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": 1.76, \"dz\": -0.04, \"dpitch\": 0.93, \"dyaw\": 2.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.21, "window_alt_abs_m": 0.05, "target_px_mean_hist": 646.2, "cur_frame_id": 25, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.38, -53.46, 20.03, -40.19, 89.31, 0.0]\n  Target bbox: [624.92, 327.71, 654.74, 391.76]\n\nFrame 2:\n  Drone pose: [-59.83, -52.81, 20.02, -40.37, 90.71, 0.0]\n  Target bbox: [625.3, 328.01, 655.05, 391.47]\n\nFrame 3:\n  Drone pose: [-59.3, -52.09, 20.02, -40.63, 92.1, 0.0]\n  Target bbox: [622.17, 326.61, 658.24, 392.82]\n\nFrame 4:\n  Drone pose: [-58.9, -51.35, 20.02, -40.91, 93.16, 0.0]\n  Target bbox: [621.97, 326.62, 658.43, 392.83]\n\nFrame 5 (current):\n  Drone pose: [-58.68, -50.66, 20.01, -41.14, 93.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.27, \"ymin\": 326.04, \"xmax\": 658.13, \"ymax\": 393.38}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.19, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": 1.72, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 1.24, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 2.23, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 1.25, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 2.73, \"dz\": -0.01, \"dpitch\": -0.24, \"dyaw\": 1.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.48, "window_alt_abs_m": 0.01, "target_px_mean_hist": 650.2, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.21, -46.92, 20.0, -41.39, 95.07, 0.0]\n  Target bbox: [621.52, 326.46, 658.88, 393.01]\n\nFrame 2:\n  Drone pose: [-58.3, -46.42, 20.0, -41.4, 94.83, 0.0]\n  Target bbox: [623.91, 326.48, 656.44, 392.98]\n\nFrame 3:\n  Drone pose: [-58.32, -45.92, 20.0, -41.4, 94.79, 0.0]\n  Target bbox: [622.72, 325.47, 657.67, 393.94]\n\nFrame 4:\n  Drone pose: [-58.3, -45.41, 20.0, -41.41, 94.84, 0.0]\n  Target bbox: [623.82, 325.97, 656.53, 393.43]\n\nFrame 5 (current):\n  Drone pose: [-58.32, -44.91, 20.0, -41.41, 94.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.91, \"ymin\": 326.34, \"xmax\": 656.44, \"ymax\": 393.1}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 1.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.99, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": 2.52, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 647.2, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.39, -41.15, 20.0, -41.77, 94.65, 0.0]\n  Target bbox: [623.86, 326.68, 656.49, 392.77]\n\nFrame 2:\n  Drone pose: [-58.43, -40.25, 20.0, -42.31, 94.63, 0.0]\n  Target bbox: [623.6, 325.01, 656.77, 394.34]\n\nFrame 3:\n  Drone pose: [-58.23, -39.12, 20.0, -43.16, 95.36, 0.0]\n  Target bbox: [620.09, 324.6, 660.34, 394.71]\n\nFrame 4:\n  Drone pose: [-58.55, -37.98, 20.0, -44.12, 94.59, 0.0]\n  Target bbox: [623.0, 324.85, 657.38, 394.42]\n\nFrame 5 (current):\n  Drone pose: [-59.13, -37.01, 20.0, -44.88, 92.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.34, \"ymin\": 324.7, \"xmax\": 657.04, \"ymax\": 394.48}, \"waypoint_deltas\": [{\"dx\": -0.74, \"dy\": 0.7, \"dz\": 0.0, \"dpitch\": -0.34, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.96, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -2.39, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": 1.22, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": -2.54, \"droll\": 0.0}, {\"dx\": -0.89, \"dy\": 1.49, \"dz\": 0.0, \"dpitch\": 0.73, \"dyaw\": -2.69, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 1.75, \"dz\": 0.0, \"dpitch\": 1.07, \"dyaw\": -2.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.14, "window_alt_abs_m": 0.0, "target_px_mean_hist": 677.0, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.11, -35.0, 20.0, -43.46, 89.98, 0.0]\n  Target bbox: [621.47, 326.28, 658.52, 393.0]\n\nFrame 2:\n  Drone pose: [-60.16, -34.74, 20.0, -43.12, 89.84, 0.0]\n  Target bbox: [624.21, 326.49, 655.71, 392.81]\n\nFrame 3:\n  Drone pose: [-60.21, -34.47, 20.0, -42.79, 89.71, 0.0]\n  Target bbox: [624.24, 326.04, 655.6, 393.25]\n\nFrame 4:\n  Drone pose: [-60.26, -34.21, 20.0, -42.46, 89.58, 0.0]\n  Target bbox: [621.33, 325.82, 658.44, 393.48]\n\nFrame 5 (current):\n  Drone pose: [-60.31, -33.95, 20.0, -42.13, 89.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.3, \"ymin\": 326.21, \"xmax\": 655.4, \"ymax\": 393.1}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": 0.27, \"dz\": 0.0, \"dpitch\": 0.32, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.64, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": 0.83, \"dz\": 0.0, \"dpitch\": 0.9, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 1.14, \"dz\": 0.0, \"dpitch\": 1.15, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 1.49, \"dz\": 0.0, \"dpitch\": 1.34, \"dyaw\": -0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 683.2, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-59.63, -31.29, 20.0, -41.01, 91.25, 0.0]\n  Target bbox: [625.08, 326.55, 655.27, 392.84]\n\nFrame 2:\n  Drone pose: [-59.27, -30.58, 20.0, -41.27, 92.24, 0.0]\n  Target bbox: [624.5, 326.63, 655.85, 392.77]\n\nFrame 3:\n  Drone pose: [-58.94, -29.92, 20.0, -41.46, 93.13, 0.0]\n  Target bbox: [623.2, 326.44, 657.17, 392.97]\n\nFrame 4:\n  Drone pose: [-58.56, -29.36, 20.0, -41.5, 94.15, 0.0]\n  Target bbox: [623.85, 326.05, 656.5, 393.34]\n\nFrame 5 (current):\n  Drone pose: [-58.78, -28.72, 20.0, -41.71, 93.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.28, \"ymin\": 326.24, \"xmax\": 659.14, \"ymax\": 393.18}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": 0.58, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": 1.14, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 1.68, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.22, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.75, \"dz\": 0.0, \"dpitch\": -0.34, \"dyaw\": 0.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 672.2, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.8, -25.44, 20.0, -42.09, 93.58, 0.0]\n  Target bbox: [623.9, 326.08, 656.46, 393.29]\n\nFrame 2:\n  Drone pose: [-58.79, -24.92, 20.0, -42.11, 93.61, 0.0]\n  Target bbox: [624.0, 326.56, 656.36, 392.85]\n\nFrame 3:\n  Drone pose: [-58.74, -24.42, 20.0, -42.11, 93.74, 0.0]\n  Target bbox: [622.69, 325.62, 657.69, 393.74]\n\nFrame 4:\n  Drone pose: [-58.72, -23.92, 20.0, -42.11, 93.8, 0.0]\n  Target bbox: [620.74, 325.99, 659.68, 393.41]\n\nFrame 5 (current):\n  Drone pose: [-58.75, -23.42, 20.0, -42.12, 93.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.46, \"ymin\": 322.52, \"xmax\": 661.49, \"ymax\": 397.06}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.83, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -2.52, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.97, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.31, "window_alt_abs_m": 0.0, "target_px_mean_hist": 676.5, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.04, -19.83, 20.0, -42.29, 91.58, 0.0]\n  Target bbox: [619.58, 323.39, 660.36, 396.15]\n\nFrame 2:\n  Drone pose: [-57.82, -19.28, 20.0, -42.36, 90.81, 0.0]\n  Target bbox: [624.78, 325.92, 655.58, 393.38]\n\nFrame 3:\n  Drone pose: [-57.59, -18.72, 20.0, -42.43, 91.45, 0.0]\n  Target bbox: [618.33, 322.32, 661.64, 397.25]\n\nFrame 4:\n  Drone pose: [-57.35, -18.15, 20.0, -42.54, 90.73, 0.0]\n  Target bbox: [624.5, 326.48, 655.87, 392.85]\n\nFrame 5 (current):\n  Drone pose: [-57.14, -17.56, 20.0, -42.66, 91.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.69, \"ymin\": 322.16, \"xmax\": 662.31, \"ymax\": 397.45}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": -0.4, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": -0.53, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 1.79, \"dz\": 0.0, \"dpitch\": -0.65, \"dyaw\": -1.49, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 2.38, \"dz\": 0.0, \"dpitch\": -0.78, \"dyaw\": -1.15, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 3.0, \"dz\": 0.0, \"dpitch\": -0.94, \"dyaw\": -0.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.69, "window_alt_abs_m": 0.0, "target_px_mean_hist": 387.5, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.11, -66.56, 22.0, -44.94, 91.19, 0.0]\n  Target bbox: [610.03, 349.95, 642.01, 413.82]\n\nFrame 2:\n  Drone pose: [-61.39, -66.77, 21.2, -49.09, 86.13, 0.0]\n  Target bbox: [627.75, 242.57, 659.66, 308.58]\n\nFrame 3:\n  Drone pose: [-61.91, -66.72, 20.68, -42.67, 85.13, 0.0]\n  Target bbox: [622.73, 326.38, 656.91, 393.0]\n\nFrame 4:\n  Drone pose: [-62.03, -66.25, 20.77, -46.06, 79.82, 0.0]\n  Target bbox: [681.33, 271.45, 722.6, 340.49]\n\nFrame 5 (current):\n  Drone pose: [-62.12, -65.81, 20.62, -37.43, 84.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.5, \"ymin\": 410.6, \"xmax\": 663.25, \"ymax\": 477.04}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.45, \"dz\": -0.03, \"dpitch\": -4.91, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 0.91, \"dz\": -0.05, \"dpitch\": -4.82, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 1.36, \"dz\": -0.07, \"dpitch\": -4.73, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.82, \"dz\": -0.09, \"dpitch\": -4.65, \"dyaw\": 0.96, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 2.29, \"dz\": -0.2, \"dpitch\": -4.45, \"dyaw\": 1.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.58, "window_alt_abs_m": 1.57, "target_px_mean_hist": 621.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.95, -63.03, 20.39, -41.21, 90.11, 0.0]\n  Target bbox: [558.96, 339.45, 594.9, 404.01]\n\nFrame 2:\n  Drone pose: [-61.86, -62.72, 20.33, -41.51, 85.36, 0.0]\n  Target bbox: [623.77, 326.87, 655.89, 392.57]\n\nFrame 3:\n  Drone pose: [-62.03, -62.03, 20.28, -39.91, 89.04, 0.0]\n  Target bbox: [568.74, 357.7, 605.81, 422.58]\n\nFrame 4:\n  Drone pose: [-61.98, -61.43, 20.28, -41.79, 85.0, 0.0]\n  Target bbox: [623.58, 325.91, 656.07, 393.49]\n\nFrame 5 (current):\n  Drone pose: [-61.93, -61.14, 20.17, -39.78, 88.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 582.36, \"ymin\": 353.91, \"xmax\": 617.01, \"ymax\": 419.97}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.65, \"dz\": 0.07, \"dpitch\": -1.88, \"dyaw\": -3.21, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.15, \"dz\": 0.05, \"dpitch\": -1.85, \"dyaw\": -3.21, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.66, \"dz\": 0.02, \"dpitch\": -1.82, \"dyaw\": -3.2, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.15, \"dz\": 0.0, \"dpitch\": -1.78, \"dyaw\": -3.19, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.63, \"dz\": -0.02, \"dpitch\": -1.73, \"dyaw\": -3.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.78, "window_alt_abs_m": 0.22, "target_px_mean_hist": 642.0, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.89, -57.59, 20.12, -41.35, 85.28, 0.0]\n  Target bbox: [623.73, 326.9, 655.92, 392.57]\n\nFrame 2:\n  Drone pose: [-61.95, -56.99, 20.04, -44.53, 86.2, 0.0]\n  Target bbox: [605.97, 272.5, 645.01, 340.37]\n\nFrame 3:\n  Drone pose: [-61.73, -56.65, 20.1, -41.78, 87.5, 0.0]\n  Target bbox: [600.7, 318.51, 632.74, 383.92]\n\nFrame 4:\n  Drone pose: [-61.93, -56.19, 20.11, -40.51, 87.66, 0.0]\n  Target bbox: [588.83, 339.22, 627.89, 404.54]\n\nFrame 5 (current):\n  Drone pose: [-61.76, -55.62, 20.2, -41.44, 85.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.86, \"ymin\": 326.09, \"xmax\": 655.79, \"ymax\": 393.31}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.27, \"dz\": -0.14, \"dpitch\": 0.5, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 0.62, \"dz\": -0.15, \"dpitch\": 0.7, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": 0.91, \"dz\": -0.16, \"dpitch\": 0.96, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 0.54, \"dy\": 1.21, \"dz\": -0.16, \"dpitch\": 1.2, \"dyaw\": 1.55, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": 1.62, \"dz\": -0.17, \"dpitch\": 1.31, \"dyaw\": 2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.42, "window_alt_abs_m": 0.24, "target_px_mean_hist": 650.0, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.44, -53.46, 19.99, -38.95, 86.91, 0.0]\n  Target bbox: [651.82, 347.39, 686.09, 412.4]\n\nFrame 2:\n  Drone pose: [-59.83, -52.81, 20.02, -40.37, 90.71, 0.0]\n  Target bbox: [622.23, 326.94, 658.14, 392.49]\n\nFrame 3:\n  Drone pose: [-59.3, -52.14, 20.12, -39.62, 87.09, 0.0]\n  Target bbox: [684.83, 347.28, 723.11, 412.56]\n\nFrame 4:\n  Drone pose: [-58.91, -51.27, 20.11, -41.15, 93.15, 0.0]\n  Target bbox: [622.93, 326.28, 657.46, 393.13]\n\nFrame 5 (current):\n  Drone pose: [-58.69, -50.69, 20.03, -41.13, 93.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.86, \"ymin\": 326.61, \"xmax\": 657.52, \"ymax\": 392.84}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": 0.66, \"dz\": -0.02, \"dpitch\": -0.16, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 1.22, \"dz\": -0.02, \"dpitch\": -0.22, \"dyaw\": 1.16, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 1.75, \"dz\": -0.02, \"dpitch\": -0.24, \"dyaw\": 1.29, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 2.26, \"dz\": -0.02, \"dpitch\": -0.25, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 2.76, \"dz\": -0.03, \"dpitch\": -0.25, \"dyaw\": 1.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.07, "window_alt_abs_m": 0.22, "target_px_mean_hist": 638.8, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.21, -46.92, 20.0, -41.39, 95.07, 0.0]\n  Target bbox: [621.25, 326.11, 659.15, 393.34]\n\nFrame 2:\n  Drone pose: [-58.25, -46.53, 20.11, -41.41, 94.95, 0.0]\n  Target bbox: [621.26, 326.02, 659.14, 393.42]\n\nFrame 3:\n  Drone pose: [-58.32, -45.92, 20.0, -41.4, 94.79, 0.0]\n  Target bbox: [623.88, 325.99, 656.47, 393.42]\n\nFrame 4:\n  Drone pose: [-58.23, -45.45, 20.07, -44.73, 100.02, 0.0]\n  Target bbox: [558.82, 271.88, 594.8, 340.84]\n\nFrame 5 (current):\n  Drone pose: [-58.38, -44.84, 20.1, -40.18, 95.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.78, \"ymin\": 350.85, \"xmax\": 649.52, \"ymax\": 418.39}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.43, \"dz\": -0.1, \"dpitch\": -1.24, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": 0.94, \"dz\": -0.1, \"dpitch\": -1.25, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.42, \"dz\": -0.1, \"dpitch\": -1.22, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": 1.92, \"dz\": -0.1, \"dpitch\": -1.22, \"dyaw\": -0.44, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.45, \"dz\": -0.1, \"dpitch\": -1.26, \"dyaw\": -0.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.35, "window_alt_abs_m": 0.32, "target_px_mean_hist": 656.8, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.39, -41.14, 20.18, -42.05, 94.66, 0.0]\n  Target bbox: [623.71, 326.18, 656.64, 393.21]\n\nFrame 2:\n  Drone pose: [-58.43, -40.25, 20.0, -42.31, 94.63, 0.0]\n  Target bbox: [620.88, 326.07, 659.53, 393.34]\n\nFrame 3:\n  Drone pose: [-58.17, -39.15, 19.97, -48.01, 94.06, 0.0]\n  Target bbox: [640.65, 241.5, 675.92, 312.05]\n\nFrame 4:\n  Drone pose: [-58.55, -37.98, 20.0, -45.62, 90.1, 0.0]\n  Target bbox: [677.19, 301.98, 712.15, 369.84]\n\nFrame 5 (current):\n  Drone pose: [-59.23, -37.15, 19.94, -44.58, 92.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.12, \"ymin\": 324.5, \"xmax\": 658.29, \"ymax\": 394.67}, \"waypoint_deltas\": [{\"dx\": -0.64, \"dy\": 0.84, \"dz\": 0.06, \"dpitch\": -0.64, \"dyaw\": -1.92, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": 1.1, \"dz\": 0.06, \"dpitch\": -0.28, \"dyaw\": -2.07, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": 1.36, \"dz\": 0.06, \"dpitch\": 0.08, \"dyaw\": -2.22, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 1.63, \"dz\": 0.06, \"dpitch\": 0.43, \"dyaw\": -2.37, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": 1.89, \"dz\": 0.06, \"dpitch\": 0.77, \"dyaw\": -2.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.1, "window_alt_abs_m": 0.3, "target_px_mean_hist": 688.0, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-60.07, -35.1, 20.03, -41.82, 89.27, 0.0]\n  Target bbox: [634.36, 352.06, 666.17, 419.45]\n\nFrame 2:\n  Drone pose: [-60.07, -34.76, 20.06, -45.12, 88.72, 0.0]\n  Target bbox: [639.03, 293.86, 675.42, 360.64]\n\nFrame 3:\n  Drone pose: [-60.16, -34.46, 20.01, -43.56, 90.6, 0.0]\n  Target bbox: [611.82, 313.35, 649.17, 381.41]\n\nFrame 4:\n  Drone pose: [-60.28, -34.14, 20.08, -42.67, 89.53, 0.0]\n  Target bbox: [624.38, 326.3, 655.36, 393.01]\n\nFrame 5 (current):\n  Drone pose: [-60.33, -33.89, 19.84, -39.78, 85.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 670.06, \"ymin\": 362.67, \"xmax\": 703.01, \"ymax\": 431.99}, \"waypoint_deltas\": [{\"dx\": -0.03, \"dy\": 0.21, \"dz\": 0.16, \"dpitch\": -2.03, \"dyaw\": 3.65, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 0.47, \"dz\": 0.16, \"dpitch\": -1.71, \"dyaw\": 3.53, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 0.77, \"dz\": 0.16, \"dpitch\": -1.45, \"dyaw\": 3.54, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": 1.08, \"dz\": 0.16, \"dpitch\": -1.2, \"dyaw\": 3.57, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.43, \"dz\": 0.16, \"dpitch\": -1.01, \"dyaw\": 3.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.36, "window_alt_abs_m": 0.37, "target_px_mean_hist": 691.0, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-59.54, -31.14, 20.02, -41.23, 91.51, 0.0]\n  Target bbox: [623.82, 326.88, 656.55, 392.52]\n\nFrame 2:\n  Drone pose: [-59.3, -30.55, 19.88, -43.73, 88.96, 0.0]\n  Target bbox: [662.37, 283.13, 699.37, 350.12]\n\nFrame 3:\n  Drone pose: [-58.94, -29.92, 20.0, -37.8, 90.41, 0.0]\n  Target bbox: [655.35, 389.09, 694.22, 454.49]\n\nFrame 4:\n  Drone pose: [-58.67, -29.52, 20.04, -37.54, 98.83, 0.0]\n  Target bbox: [557.8, 391.0, 595.69, 460.66]\n\nFrame 5 (current):\n  Drone pose: [-58.81, -28.74, 19.84, -39.78, 98.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 559.22, \"ymin\": 354.77, \"xmax\": 594.62, \"ymax\": 423.99}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": 0.6, \"dz\": 0.16, \"dpitch\": -2.04, \"dyaw\": -4.81, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 1.16, \"dz\": 0.16, \"dpitch\": -2.11, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": 1.7, \"dz\": 0.16, \"dpitch\": -2.17, \"dyaw\": -4.88, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 2.24, \"dz\": 0.16, \"dpitch\": -2.22, \"dyaw\": -4.82, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": 2.77, \"dz\": 0.16, \"dpitch\": -2.27, \"dyaw\": -4.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.75, "window_alt_abs_m": 0.51, "target_px_mean_hist": 651.0, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.77, -25.5, 19.96, -41.73, 98.64, 0.0]\n  Target bbox: [559.6, 330.24, 595.24, 399.9]\n\nFrame 2:\n  Drone pose: [-58.79, -24.92, 20.0, -42.25, 91.55, 0.0]\n  Target bbox: [649.79, 324.08, 682.37, 391.15]\n\nFrame 3:\n  Drone pose: [-58.74, -24.42, 20.0, -41.91, 98.26, 0.0]\n  Target bbox: [562.57, 330.09, 604.42, 399.02]\n\nFrame 4:\n  Drone pose: [-58.76, -23.89, 20.18, -42.43, 93.69, 0.0]\n  Target bbox: [624.04, 325.62, 656.32, 393.7]\n\nFrame 5 (current):\n  Drone pose: [-58.75, -23.42, 20.0, -39.9, 96.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 583.11, \"ymin\": 360.67, \"xmax\": 627.14, \"ymax\": 434.54}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -2.25, \"dyaw\": -4.16, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -2.25, \"dyaw\": -4.22, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -2.28, \"dyaw\": -5.6, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": -2.29, \"dyaw\": -5.29, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": -2.31, \"dyaw\": -4.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.16, "window_alt_abs_m": 0.4, "target_px_mean_hist": 668.2, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.01, -19.8, 19.82, -40.57, 96.65, 0.0]\n  Target bbox: [553.53, 348.42, 600.8, 424.4]\n\nFrame 2:\n  Drone pose: [-57.7, -19.19, 19.99, -42.47, 91.14, 0.0]\n  Target bbox: [624.65, 325.79, 655.71, 393.51]\n\nFrame 3:\n  Drone pose: [-57.59, -18.72, 20.0, -42.43, 91.45, 0.0]\n  Target bbox: [618.51, 322.0, 661.46, 397.52]\n\nFrame 4:\n  Drone pose: [-57.35, -18.15, 20.0, -42.54, 90.73, 0.0]\n  Target bbox: [624.65, 325.8, 655.72, 393.49]\n\nFrame 5 (current):\n  Drone pose: [-57.14, -17.56, 20.0, -40.17, 87.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 668.0, \"ymin\": 364.33, \"xmax\": 715.03, \"ymax\": 441.48}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": -2.89, \"dyaw\": 3.22, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": -3.02, \"dyaw\": 3.66, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 1.79, \"dz\": 0.0, \"dpitch\": -3.14, \"dyaw\": 2.64, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 2.38, \"dz\": 0.0, \"dpitch\": -3.27, \"dyaw\": 2.98, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 3.0, \"dz\": 0.0, \"dpitch\": -3.43, \"dyaw\": 3.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.1, "window_alt_abs_m": 0.18, "target_px_mean_hist": 381.5, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-13/trajectory_1776048144", "difficulty_score": 0.2286, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [22.87, 4.45, 22.0, -46.27, 177.14, 0.0]\n  Target bbox: [627.45, 325.29, 652.23, 394.06]\n\nFrame 2:\n  Drone pose: [21.3, 3.39, 21.2, -46.6, 173.79, 0.0]\n  Target bbox: [624.58, 322.12, 654.99, 397.28]\n\nFrame 3:\n  Drone pose: [20.57, 3.2, 20.67, -46.14, 173.16, 0.0]\n  Target bbox: [626.1, 328.03, 653.67, 391.16]\n\nFrame 4:\n  Drone pose: [19.77, 2.95, 20.64, -46.51, 172.26, 0.0]\n  Target bbox: [623.62, 321.25, 655.95, 398.07]\n\nFrame 5 (current):\n  Drone pose: [19.19, 2.94, 20.62, -46.6, 172.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.39, \"ymin\": 323.23, \"xmax\": 655.25, \"ymax\": 396.0}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": 0.03, \"dz\": -0.03, \"dpitch\": -0.05, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.11, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": 0.19, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": 0.25, \"dz\": -0.09, \"dpitch\": 0.05, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 0.3, \"dz\": -0.2, \"dpitch\": 0.18, \"dyaw\": -0.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.92, "window_alt_abs_m": 1.38, "target_px_mean_hist": 551.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [16.58, 3.24, 20.42, -46.42, 171.53, 0.0]\n  Target bbox: [623.63, 321.93, 655.97, 397.34]\n\nFrame 2:\n  Drone pose: [16.07, 3.28, 20.39, -46.4, 171.65, 0.0]\n  Target bbox: [624.7, 324.83, 655.0, 394.35]\n\nFrame 3:\n  Drone pose: [15.56, 3.3, 20.36, -46.38, 171.73, 0.0]\n  Target bbox: [623.55, 321.15, 655.99, 398.28]\n\nFrame 4:\n  Drone pose: [15.05, 3.33, 20.33, -46.36, 171.79, 0.0]\n  Target bbox: [626.13, 327.82, 653.66, 391.31]\n\nFrame 5 (current):\n  Drone pose: [14.54, 3.34, 20.3, -46.34, 171.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.9, \"ymin\": 326.68, \"xmax\": 653.85, \"ymax\": 392.48}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.01, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.01, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.01, \"dz\": -0.08, \"dpitch\": 0.07, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": -0.01, \"dz\": -0.11, \"dpitch\": 0.1, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.04, \"dz\": -0.13, \"dpitch\": 0.12, \"dyaw\": -0.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.3, "window_alt_abs_m": 0.12, "target_px_mean_hist": 582.0, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [11.47, 3.26, 20.15, -46.21, 171.54, 0.0]\n  Target bbox: [624.51, 321.3, 655.09, 397.95]\n\nFrame 2:\n  Drone pose: [10.95, 3.2, 20.13, -46.19, 171.37, 0.0]\n  Target bbox: [623.89, 322.35, 655.71, 396.98]\n\nFrame 3:\n  Drone pose: [10.44, 3.14, 20.12, -46.18, 171.17, 0.0]\n  Target bbox: [625.33, 325.13, 654.39, 394.05]\n\nFrame 4:\n  Drone pose: [9.92, 3.07, 20.1, -46.16, 170.93, 0.0]\n  Target bbox: [622.71, 319.75, 656.8, 399.63]\n\nFrame 5 (current):\n  Drone pose: [9.4, 2.98, 20.09, -46.15, 170.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.99, \"ymin\": 323.07, \"xmax\": 655.64, \"ymax\": 396.23}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.09, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.19, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": -0.29, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -0.38, \"dz\": -0.04, \"dpitch\": 0.04, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": -0.47, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": -1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.87, "window_alt_abs_m": 0.06, "target_px_mean_hist": 606.5, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [6.27, 2.44, 20.04, -46.12, 168.93, 0.0]\n  Target bbox: [622.75, 321.4, 656.85, 397.86]\n\nFrame 2:\n  Drone pose: [5.74, 2.41, 20.03, -46.15, 168.8, 0.0]\n  Target bbox: [622.74, 321.25, 656.83, 398.14]\n\nFrame 3:\n  Drone pose: [5.2, 2.42, 20.03, -46.2, 168.81, 0.0]\n  Target bbox: [623.5, 320.63, 656.09, 398.63]\n\nFrame 4:\n  Drone pose: [4.66, 2.47, 20.02, -46.27, 168.94, 0.0]\n  Target bbox: [617.84, 321.18, 662.29, 398.11]\n\nFrame 5 (current):\n  Drone pose: [4.12, 2.55, 20.02, -46.2, 167.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.76, \"ymin\": 321.4, \"xmax\": 662.41, \"ymax\": 397.96}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 0.14, \"dz\": -0.01, \"dpitch\": 0.18, \"dyaw\": -2.7, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": 0.16, \"dz\": -0.01, \"dpitch\": 0.31, \"dyaw\": -4.22, \"droll\": 0.0}, {\"dx\": -2.22, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 0.23, \"dyaw\": -4.42, \"droll\": 0.0}, {\"dx\": -2.79, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.18, \"dyaw\": -4.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.59, "window_alt_abs_m": 0.02, "target_px_mean_hist": 598.2, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00043/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [0.75, 2.4, 20.01, -46.07, 162.28, 0.0]\n  Target bbox: [624.51, 326.65, 655.32, 392.49]\n\nFrame 2:\n  Drone pose: [0.16, 2.21, 20.0, -46.11, 161.65, 0.0]\n  Target bbox: [623.99, 325.35, 655.8, 393.79]\n\nFrame 3:\n  Drone pose: [-0.43, 2.02, 20.0, -46.14, 160.96, 0.0]\n  Target bbox: [623.9, 325.64, 655.9, 393.53]\n\nFrame 4:\n  Drone pose: [-1.03, 1.82, 20.0, -46.18, 160.28, 0.0]\n  Target bbox: [619.47, 318.81, 660.07, 400.57]\n\nFrame 5 (current):\n  Drone pose: [-1.63, 1.66, 20.0, -46.25, 159.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.08, \"ymin\": 318.62, \"xmax\": 660.43, \"ymax\": 400.84}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -0.44, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": -1.84, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.36, \"dyaw\": -2.19, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": -3.52, \"droll\": 0.0}, {\"dx\": -3.06, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -4.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 507.8, "cur_frame_id": 43, "source": "ORI", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.31, 2.0, 20.0, -46.43, 154.0, 0.0]\n  Target bbox: [619.51, 322.33, 660.77, 396.84]\n\nFrame 2:\n  Drone pose: [-5.93, 2.24, 20.0, -46.42, 153.1, 0.0]\n  Target bbox: [622.67, 322.9, 657.62, 396.29]\n\nFrame 3:\n  Drone pose: [-6.52, 2.51, 20.0, -46.38, 152.32, 0.0]\n  Target bbox: [622.28, 324.15, 657.99, 395.07]\n\nFrame 4:\n  Drone pose: [-7.12, 2.8, 20.0, -46.36, 151.61, 0.0]\n  Target bbox: [623.28, 325.07, 656.95, 394.08]\n\nFrame 5 (current):\n  Drone pose: [-7.72, 3.11, 20.0, -46.36, 150.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.42, \"ymin\": 322.75, \"xmax\": 657.89, \"ymax\": 396.42}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.68, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -1.1, \"droll\": 0.0}, {\"dx\": -1.69, \"dy\": 1.04, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -1.56, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": 1.41, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -2.03, \"droll\": 0.0}, {\"dx\": -2.81, \"dy\": 1.78, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -2.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.08, "window_alt_abs_m": 0.0, "target_px_mean_hist": 454.2, "cur_frame_id": 53, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.53, 4.89, 20.0, -46.21, 148.47, 0.0]\n  Target bbox: [625.54, 325.38, 654.72, 393.85]\n\nFrame 2:\n  Drone pose: [-11.02, 5.29, 20.0, -46.11, 148.21, 0.0]\n  Target bbox: [620.82, 323.2, 659.53, 396.05]\n\nFrame 3:\n  Drone pose: [-11.41, 5.74, 20.0, -45.92, 148.27, 0.0]\n  Target bbox: [620.21, 322.65, 660.16, 396.6]\n\nFrame 4:\n  Drone pose: [-12.04, 6.04, 20.0, -45.93, 147.55, 0.0]\n  Target bbox: [623.27, 327.48, 656.91, 391.61]\n\nFrame 5 (current):\n  Drone pose: [-13.08, 6.04, 20.0, -46.2, 145.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.91, \"ymin\": 325.31, \"xmax\": 656.36, \"ymax\": 393.82}, \"waypoint_deltas\": [{\"dx\": -1.31, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": -3.26, \"droll\": 0.0}, {\"dx\": -3.25, \"dy\": -0.84, \"dz\": 0.0, \"dpitch\": -0.92, \"dyaw\": -9.1, \"droll\": 0.0}, {\"dx\": -5.6, \"dy\": -1.81, \"dz\": 0.0, \"dpitch\": -1.82, \"dyaw\": -15.74, \"droll\": 0.0}, {\"dx\": -9.65, \"dy\": -3.97, \"dz\": 0.0, \"dpitch\": -1.98, \"dyaw\": -29.67, \"droll\": 0.0}, {\"dx\": -13.59, \"dy\": -6.0, \"dz\": 0.0, \"dpitch\": -0.68, \"dyaw\": -42.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.28, "window_alt_abs_m": 0.0, "target_px_mean_hist": 594.2, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.05, -1.6, 20.0, -44.97, 93.17, 0.0]\n  Target bbox: [617.91, 326.59, 662.48, 392.52]\n\nFrame 2:\n  Drone pose: [-32.66, -2.75, 20.0, -43.3, 87.02, 0.0]\n  Target bbox: [620.55, 327.45, 659.1, 391.76]\n\nFrame 3:\n  Drone pose: [-34.52, -3.43, 20.0, -42.21, 83.41, 0.0]\n  Target bbox: [619.47, 328.22, 660.22, 391.16]\n\nFrame 4:\n  Drone pose: [-35.53, -3.51, 20.0, -42.01, 82.06, 0.0]\n  Target bbox: [617.51, 327.63, 662.17, 391.77]\n\nFrame 5 (current):\n  Drone pose: [-36.47, -3.51, 20.0, -41.93, 80.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.55, \"ymin\": 327.76, \"xmax\": 656.44, \"ymax\": 391.69}, \"waypoint_deltas\": [{\"dx\": -0.88, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.69, \"dyaw\": -0.8, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": 0.39, \"dz\": 0.0, \"dpitch\": 0.89, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": 1.01, \"dyaw\": -0.99, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": 1.26, \"dz\": 0.0, \"dpitch\": 1.08, \"dyaw\": -1.03, \"droll\": 0.0}, {\"dx\": -3.04, \"dy\": 1.73, \"dz\": 0.0, \"dpitch\": 1.12, \"dyaw\": -1.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.28, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-40.04, -1.29, 20.0, -40.79, 79.75, 0.0]\n  Target bbox: [620.58, 324.87, 659.49, 394.72]\n\nFrame 2:\n  Drone pose: [-40.56, -0.79, 20.0, -40.78, 79.7, 0.0]\n  Target bbox: [620.32, 324.83, 659.79, 394.83]\n\nFrame 3:\n  Drone pose: [-41.05, -0.29, 20.0, -40.79, 79.72, 0.0]\n  Target bbox: [624.2, 328.52, 655.8, 390.97]\n\nFrame 4:\n  Drone pose: [-41.54, 0.21, 20.0, -40.79, 79.74, 0.0]\n  Target bbox: [620.7, 324.34, 659.44, 395.28]\n\nFrame 5 (current):\n  Drone pose: [-42.04, 0.71, 20.0, -40.79, 79.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.65, \"ymin\": 325.99, \"xmax\": 658.43, \"ymax\": 393.61}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.1, "window_alt_abs_m": 0.0, "target_px_mean_hist": 527.5, "cur_frame_id": 82, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.04, 3.71, 20.0, -40.79, 79.75, 0.0]\n  Target bbox: [620.52, 324.26, 659.62, 395.37]\n\nFrame 2:\n  Drone pose: [-45.54, 4.21, 20.0, -40.79, 79.76, 0.0]\n  Target bbox: [621.6, 324.78, 658.54, 394.81]\n\nFrame 3:\n  Drone pose: [-46.04, 4.71, 20.0, -40.79, 79.75, 0.0]\n  Target bbox: [623.94, 328.4, 656.05, 391.1]\n\nFrame 4:\n  Drone pose: [-46.52, 5.2, 20.0, -40.79, 79.8, 0.0]\n  Target bbox: [621.37, 325.8, 658.72, 393.82]\n\nFrame 5 (current):\n  Drone pose: [-47.01, 5.72, 20.0, -40.81, 79.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.3, \"ymin\": 326.88, \"xmax\": 655.84, \"ymax\": 392.66}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 1.04, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 1.54, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": 2.05, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -2.25, \"dy\": 2.55, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.07, "window_alt_abs_m": 0.0, "target_px_mean_hist": 511.8, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [22.94, 4.56, 21.92, -51.06, 175.37, 0.0]\n  Target bbox: [651.81, 244.43, 677.5, 307.38]\n\nFrame 2:\n  Drone pose: [21.42, 3.47, 21.28, -46.53, 174.07, 0.0]\n  Target bbox: [626.85, 326.83, 652.88, 392.4]\n\nFrame 3:\n  Drone pose: [20.61, 3.07, 20.68, -46.07, 172.76, 0.0]\n  Target bbox: [626.31, 328.6, 653.49, 390.52]\n\nFrame 4:\n  Drone pose: [19.79, 2.98, 20.62, -47.65, 170.7, 0.0]\n  Target bbox: [643.72, 303.6, 675.16, 375.97]\n\nFrame 5 (current):\n  Drone pose: [19.25, 2.89, 20.75, -50.35, 176.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 578.0, \"ymin\": 264.69, \"xmax\": 606.66, \"ymax\": 333.64}, \"waypoint_deltas\": [{\"dx\": -0.62, \"dy\": 0.08, \"dz\": -0.16, \"dpitch\": 3.7, \"dyaw\": -3.89, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.16, \"dz\": -0.18, \"dpitch\": 3.81, \"dyaw\": -5.17, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 0.24, \"dz\": -0.2, \"dpitch\": 3.8, \"dyaw\": -4.95, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": 0.3, \"dz\": -0.22, \"dpitch\": 3.8, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": 0.35, \"dz\": -0.33, \"dpitch\": 3.93, \"dyaw\": -4.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.13, "window_alt_abs_m": 1.43, "target_px_mean_hist": 544.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [16.52, 3.13, 20.48, -46.59, 171.16, 0.0]\n  Target bbox: [624.66, 322.75, 654.97, 396.53]\n\nFrame 2:\n  Drone pose: [16.13, 3.4, 20.36, -46.29, 172.05, 0.0]\n  Target bbox: [624.69, 321.67, 654.91, 397.61]\n\nFrame 3:\n  Drone pose: [15.59, 3.43, 20.3, -42.54, 177.12, 0.0]\n  Target bbox: [565.48, 387.4, 597.22, 461.14]\n\nFrame 4:\n  Drone pose: [15.14, 3.39, 20.37, -42.59, 174.87, 0.0]\n  Target bbox: [592.29, 388.96, 620.59, 455.73]\n\nFrame 5 (current):\n  Drone pose: [14.54, 3.34, 20.3, -51.32, 175.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 583.94, \"ymin\": 244.72, \"xmax\": 613.34, \"ymax\": 308.53}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.01, \"dz\": -0.03, \"dpitch\": 5.0, \"dyaw\": -3.5, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.01, \"dz\": -0.06, \"dpitch\": 5.03, \"dyaw\": -3.5, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.01, \"dz\": -0.08, \"dpitch\": 5.05, \"dyaw\": -3.53, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": -0.01, \"dz\": -0.11, \"dpitch\": 5.08, \"dyaw\": -3.59, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.04, \"dz\": -0.13, \"dpitch\": 5.1, \"dyaw\": -3.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.7, "window_alt_abs_m": 0.32, "target_px_mean_hist": 593.5, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [11.52, 3.31, 20.01, -41.85, 176.1, 0.0]\n  Target bbox: [572.53, 391.67, 603.95, 466.97]\n\nFrame 2:\n  Drone pose: [10.95, 3.2, 20.13, -44.78, 166.37, 0.0]\n  Target bbox: [682.81, 352.53, 713.82, 417.52]\n\nFrame 3:\n  Drone pose: [10.54, 3.2, 20.01, -45.92, 176.4, 0.0]\n  Target bbox: [566.32, 328.96, 595.98, 391.89]\n\nFrame 4:\n  Drone pose: [9.95, 3.1, 20.07, -46.07, 171.04, 0.0]\n  Target bbox: [625.6, 327.34, 654.2, 391.73]\n\nFrame 5 (current):\n  Drone pose: [9.4, 2.98, 20.09, -46.15, 170.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.3, \"ymin\": 325.56, \"xmax\": 654.43, \"ymax\": 393.61}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.09, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.19, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": -0.29, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -0.38, \"dz\": -0.04, \"dpitch\": 0.04, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": -0.47, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": -1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.49, "window_alt_abs_m": 0.34, "target_px_mean_hist": 610.0, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [6.27, 2.44, 20.04, -47.84, 164.85, 0.0]\n  Target bbox: [669.5, 293.69, 705.7, 370.36]\n\nFrame 2:\n  Drone pose: [5.78, 2.25, 20.06, -46.09, 168.32, 0.0]\n  Target bbox: [623.54, 321.64, 656.07, 397.68]\n\nFrame 3:\n  Drone pose: [5.07, 2.37, 20.08, -47.61, 173.58, 0.0]\n  Target bbox: [566.27, 310.83, 597.29, 374.17]\n\nFrame 4:\n  Drone pose: [4.66, 2.47, 20.02, -46.27, 168.94, 0.0]\n  Target bbox: [621.19, 323.85, 658.91, 395.36]\n\nFrame 5 (current):\n  Drone pose: [3.98, 2.65, 20.07, -46.52, 167.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.67, \"ymin\": 321.41, \"xmax\": 662.5, \"ymax\": 397.93}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": -0.01, \"dz\": -0.05, \"dpitch\": 0.41, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": 0.04, \"dz\": -0.06, \"dpitch\": 0.5, \"dyaw\": -2.9, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 0.06, \"dz\": -0.06, \"dpitch\": 0.63, \"dyaw\": -4.42, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": 0.01, \"dz\": -0.06, \"dpitch\": 0.55, \"dyaw\": -4.62, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": -0.1, \"dz\": -0.06, \"dpitch\": 0.5, \"dyaw\": -5.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.49, "window_alt_abs_m": 0.15, "target_px_mean_hist": 605.5, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00043/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [0.75, 2.4, 20.01, -46.07, 162.28, 0.0]\n  Target bbox: [621.51, 320.07, 658.07, 399.25]\n\nFrame 2:\n  Drone pose: [0.0, 2.18, 20.05, -43.49, 160.19, 0.0]\n  Target bbox: [636.79, 373.1, 670.95, 443.89]\n\nFrame 3:\n  Drone pose: [-0.43, 2.02, 20.0, -46.14, 160.96, 0.0]\n  Target bbox: [621.11, 319.94, 658.48, 399.38]\n\nFrame 4:\n  Drone pose: [-1.16, 1.86, 20.03, -46.84, 156.79, 0.0]\n  Target bbox: [661.15, 314.67, 699.7, 392.72]\n\nFrame 5 (current):\n  Drone pose: [-1.63, 1.66, 20.0, -44.05, 154.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 679.01, \"ymin\": 361.39, \"xmax\": 717.65, \"ymax\": 435.29}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -2.3, \"dyaw\": 4.56, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": -2.44, \"dyaw\": 4.32, \"droll\": 0.0}, {\"dx\": -1.84, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -2.56, \"dyaw\": 2.81, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -2.48, \"dyaw\": 1.48, \"droll\": 0.0}, {\"dx\": -3.06, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": -2.41, \"dyaw\": 0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.14, "window_alt_abs_m": 0.14, "target_px_mean_hist": 455.8, "cur_frame_id": 43, "source": "aug_001", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.31, 2.0, 20.0, -42.4, 149.42, 0.0]\n  Target bbox: [674.3, 391.83, 713.04, 465.62]\n\nFrame 2:\n  Drone pose: [-5.95, 2.4, 19.99, -48.57, 156.3, 0.0]\n  Target bbox: [589.24, 289.01, 625.96, 363.19]\n\nFrame 3:\n  Drone pose: [-6.52, 2.51, 20.0, -46.38, 152.32, 0.0]\n  Target bbox: [620.59, 324.11, 659.64, 394.98]\n\nFrame 4:\n  Drone pose: [-7.12, 2.8, 20.0, -45.65, 146.74, 0.0]\n  Target bbox: [680.76, 338.84, 713.14, 407.62]\n\nFrame 5 (current):\n  Drone pose: [-7.69, 3.08, 20.04, -46.35, 150.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.98, \"ymin\": 322.61, \"xmax\": 660.33, \"ymax\": 396.56}, \"waypoint_deltas\": [{\"dx\": -0.6, \"dy\": 0.36, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": 0.71, \"dz\": -0.04, \"dpitch\": 0.06, \"dyaw\": -1.07, \"droll\": 0.0}, {\"dx\": -1.72, \"dy\": 1.07, \"dz\": -0.04, \"dpitch\": 0.09, \"dyaw\": -1.53, \"droll\": 0.0}, {\"dx\": -2.29, \"dy\": 1.44, \"dz\": -0.04, \"dpitch\": 0.11, \"dyaw\": -2.0, \"droll\": 0.0}, {\"dx\": -2.84, \"dy\": 1.81, \"dz\": -0.04, \"dpitch\": 0.14, \"dyaw\": -2.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.61, "window_alt_abs_m": 0.07, "target_px_mean_hist": 462.2, "cur_frame_id": 53, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.53, 4.89, 20.0, -46.21, 148.47, 0.0]\n  Target bbox: [620.36, 322.72, 659.99, 396.45]\n\nFrame 2:\n  Drone pose: [-11.02, 5.29, 20.0, -50.12, 151.75, 0.0]\n  Target bbox: [578.59, 256.03, 618.36, 330.14]\n\nFrame 3:\n  Drone pose: [-11.54, 5.75, 19.98, -42.59, 150.2, 0.0]\n  Target bbox: [598.47, 385.06, 631.54, 451.91]\n\nFrame 4:\n  Drone pose: [-12.06, 5.93, 20.01, -43.57, 147.28, 0.0]\n  Target bbox: [623.56, 362.47, 655.12, 433.88]\n\nFrame 5 (current):\n  Drone pose: [-13.0, 5.88, 19.95, -42.85, 140.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 679.33, \"ymin\": 376.54, \"xmax\": 718.94, \"ymax\": 448.06}, \"waypoint_deltas\": [{\"dx\": -1.39, \"dy\": -0.03, \"dz\": 0.05, \"dpitch\": -3.73, \"dyaw\": 2.0, \"droll\": 0.0}, {\"dx\": -3.33, \"dy\": -0.68, \"dz\": 0.05, \"dpitch\": -4.27, \"dyaw\": -3.84, \"droll\": 0.0}, {\"dx\": -5.68, \"dy\": -1.65, \"dz\": 0.05, \"dpitch\": -5.17, \"dyaw\": -10.48, \"droll\": 0.0}, {\"dx\": -9.73, \"dy\": -3.81, \"dz\": 0.05, \"dpitch\": -5.33, \"dyaw\": -24.41, \"droll\": 0.0}, {\"dx\": -13.67, \"dy\": -5.84, \"dz\": 0.05, \"dpitch\": -4.03, \"dyaw\": -37.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.98, "window_alt_abs_m": 0.1, "target_px_mean_hist": 591.0, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.12, -1.54, 19.84, -44.82, 92.98, 0.0]\n  Target bbox: [619.5, 327.1, 660.85, 391.98]\n\nFrame 2:\n  Drone pose: [-32.66, -2.75, 20.0, -43.3, 87.02, 0.0]\n  Target bbox: [618.06, 327.85, 661.56, 391.39]\n\nFrame 3:\n  Drone pose: [-34.51, -3.56, 19.87, -41.84, 83.48, 0.0]\n  Target bbox: [617.9, 326.99, 661.75, 392.35]\n\nFrame 4:\n  Drone pose: [-35.53, -3.51, 20.0, -44.62, 84.21, 0.0]\n  Target bbox: [600.79, 285.79, 625.08, 346.54]\n\nFrame 5 (current):\n  Drone pose: [-36.47, -3.51, 20.0, -39.51, 83.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 591.88, \"ymin\": 365.28, \"xmax\": 632.0, \"ymax\": 436.29}, \"waypoint_deltas\": [{\"dx\": -0.88, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": -1.73, \"dyaw\": -3.02, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": 0.39, \"dz\": 0.0, \"dpitch\": -1.53, \"dyaw\": -3.15, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": -1.41, \"dyaw\": -3.21, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": 1.26, \"dz\": 0.0, \"dpitch\": -1.34, \"dyaw\": -3.25, \"droll\": 0.0}, {\"dx\": -3.04, \"dy\": 1.73, \"dz\": 0.0, \"dpitch\": -1.3, \"dyaw\": -3.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.33, "window_alt_abs_m": 0.42, "target_px_mean_hist": 527.8, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.98, -1.16, 20.05, -41.05, 79.84, 0.0]\n  Target bbox: [623.76, 328.28, 656.24, 391.23]\n\nFrame 2:\n  Drone pose: [-40.71, -0.83, 20.03, -40.74, 79.32, 0.0]\n  Target bbox: [621.6, 326.61, 658.45, 393.01]\n\nFrame 3:\n  Drone pose: [-41.11, -0.23, 19.88, -38.26, 77.93, 0.0]\n  Target bbox: [642.51, 367.0, 679.37, 434.07]\n\nFrame 4:\n  Drone pose: [-41.54, 0.21, 20.0, -43.52, 75.72, 0.0]\n  Target bbox: [673.24, 282.03, 709.73, 348.18]\n\nFrame 5 (current):\n  Drone pose: [-42.04, 0.71, 20.0, -40.77, 79.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.58, \"ymin\": 325.92, \"xmax\": 665.49, \"ymax\": 394.44}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.64, "window_alt_abs_m": 0.28, "target_px_mean_hist": 484.5, "cur_frame_id": 82, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.04, 3.71, 20.0, -40.79, 79.75, 0.0]\n  Target bbox: [623.44, 326.08, 656.72, 393.48]\n\nFrame 2:\n  Drone pose: [-45.35, 4.14, 19.95, -39.71, 78.12, 0.0]\n  Target bbox: [648.83, 342.55, 686.02, 409.56]\n\nFrame 3:\n  Drone pose: [-46.03, 4.63, 20.02, -38.03, 75.68, 0.0]\n  Target bbox: [673.3, 372.52, 712.63, 440.03]\n\nFrame 4:\n  Drone pose: [-46.52, 5.18, 19.98, -40.73, 79.82, 0.0]\n  Target bbox: [621.35, 326.13, 658.72, 393.5]\n\nFrame 5 (current):\n  Drone pose: [-46.88, 5.8, 20.03, -39.17, 85.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 557.94, \"ymin\": 358.48, \"xmax\": 594.74, \"ymax\": 425.69}, \"waypoint_deltas\": [{\"dx\": -0.62, \"dy\": 0.43, \"dz\": -0.03, \"dpitch\": -1.65, \"dyaw\": -5.28, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 0.96, \"dz\": -0.03, \"dpitch\": -1.68, \"dyaw\": -5.35, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": 1.46, \"dz\": -0.03, \"dpitch\": -1.68, \"dyaw\": -5.41, \"droll\": 0.0}, {\"dx\": -2.18, \"dy\": 1.97, \"dz\": -0.03, \"dpitch\": -1.7, \"dyaw\": -5.46, \"droll\": 0.0}, {\"dx\": -2.38, \"dy\": 2.47, \"dz\": -0.03, \"dpitch\": -1.76, \"dyaw\": -4.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.51, "window_alt_abs_m": 0.22, "target_px_mean_hist": 523.0, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776500808", "difficulty_score": 0.4998, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-50.61, 45.44, 22.0, -46.87, 90.0, 0.0]\n  Target bbox: [629.56, 340.94, 650.44, 378.72]\n\nFrame 2:\n  Drone pose: [-51.64, 45.04, 21.2, -44.47, 87.19, 0.0]\n  Target bbox: [630.9, 339.86, 649.0, 379.82]\n\nFrame 3:\n  Drone pose: [-52.07, 45.17, 20.67, -43.19, 86.08, 0.0]\n  Target bbox: [628.23, 339.4, 651.65, 380.27]\n\nFrame 4:\n  Drone pose: [-52.2, 45.57, 20.64, -43.01, 85.75, 0.0]\n  Target bbox: [631.38, 339.65, 648.52, 380.06]\n\nFrame 5 (current):\n  Drone pose: [-52.21, 46.07, 20.62, -42.97, 85.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.86, \"ymin\": 339.43, \"xmax\": 651.03, \"ymax\": 380.23}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.03, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.54, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.05, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.56, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 251.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.21, 49.66, 20.36, -42.71, 85.69, 0.0]\n  Target bbox: [632.2, 339.84, 647.72, 379.86]\n\nFrame 2:\n  Drone pose: [-52.21, 50.17, 20.33, -42.68, 85.69, 0.0]\n  Target bbox: [629.45, 339.61, 650.44, 380.07]\n\nFrame 3:\n  Drone pose: [-52.21, 50.68, 20.3, -42.65, 85.69, 0.0]\n  Target bbox: [628.45, 339.09, 651.42, 380.61]\n\nFrame 4:\n  Drone pose: [-52.21, 51.19, 20.27, -42.63, 85.68, 0.0]\n  Target bbox: [627.72, 339.76, 652.15, 379.95]\n\nFrame 5 (current):\n  Drone pose: [-52.21, 51.7, 20.24, -42.6, 85.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.44, \"ymin\": 339.0, \"xmax\": 651.44, \"ymax\": 380.65}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.52, \"dz\": -0.07, \"dpitch\": 0.07, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.03, \"dz\": -0.09, \"dpitch\": 0.09, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.54, \"dz\": -0.11, \"dpitch\": 0.1, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.11, "target_px_mean_hist": 257.0, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.21, 55.76, 20.09, -42.46, 85.66, 0.0]\n  Target bbox: [628.11, 338.94, 651.76, 380.74]\n\nFrame 2:\n  Drone pose: [-52.21, 56.27, 20.08, -42.45, 85.66, 0.0]\n  Target bbox: [629.94, 339.24, 649.95, 380.43]\n\nFrame 3:\n  Drone pose: [-52.21, 56.78, 20.07, -42.45, 85.66, 0.0]\n  Target bbox: [629.98, 339.95, 649.91, 379.77]\n\nFrame 4:\n  Drone pose: [-52.21, 57.28, 20.06, -42.44, 85.66, 0.0]\n  Target bbox: [628.12, 339.12, 651.75, 380.56]\n\nFrame 5 (current):\n  Drone pose: [-52.21, 57.79, 20.05, -42.44, 85.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.4, \"ymin\": 339.36, \"xmax\": 651.47, \"ymax\": 380.33}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.02, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.04, "target_px_mean_hist": 265.5, "cur_frame_id": 27, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.22, 61.83, 20.01, -42.43, 85.65, 0.0]\n  Target bbox: [633.52, 339.26, 646.4, 380.44]\n\nFrame 2:\n  Drone pose: [-52.22, 62.33, 20.01, -42.44, 85.65, 0.0]\n  Target bbox: [631.05, 339.12, 648.84, 380.58]\n\nFrame 3:\n  Drone pose: [-52.22, 62.83, 20.01, -42.44, 85.65, 0.0]\n  Target bbox: [630.16, 339.06, 649.73, 380.64]\n\nFrame 4:\n  Drone pose: [-52.22, 63.34, 20.01, -42.44, 85.64, 0.0]\n  Target bbox: [629.76, 339.04, 650.12, 380.66]\n\nFrame 5 (current):\n  Drone pose: [-52.22, 63.84, 20.01, -42.44, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 633.57, \"ymin\": 339.26, \"xmax\": 646.36, \"ymax\": 380.44}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 269.5, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.22, 67.36, 20.0, -42.47, 85.64, 0.0]\n  Target bbox: [628.11, 338.84, 651.76, 380.82]\n\nFrame 2:\n  Drone pose: [-52.22, 67.87, 20.0, -42.47, 85.64, 0.0]\n  Target bbox: [628.72, 339.41, 651.16, 380.3]\n\nFrame 3:\n  Drone pose: [-52.22, 68.37, 20.0, -42.47, 85.64, 0.0]\n  Target bbox: [627.93, 338.76, 651.94, 380.92]\n\nFrame 4:\n  Drone pose: [-52.22, 68.87, 20.0, -42.47, 85.64, 0.0]\n  Target bbox: [628.12, 339.15, 651.75, 380.53]\n\nFrame 5 (current):\n  Drone pose: [-52.22, 69.37, 20.0, -42.48, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.19, \"ymin\": 339.19, \"xmax\": 651.69, \"ymax\": 380.5}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 268.2, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.24, 73.39, 20.0, -42.5, 85.57, 0.0]\n  Target bbox: [631.08, 339.23, 648.82, 380.46]\n\nFrame 2:\n  Drone pose: [-52.25, 73.89, 20.0, -42.5, 85.54, 0.0]\n  Target bbox: [627.9, 338.78, 651.97, 380.89]\n\nFrame 3:\n  Drone pose: [-52.27, 74.39, 20.0, -42.5, 85.49, 0.0]\n  Target bbox: [632.56, 339.22, 647.35, 380.49]\n\nFrame 4:\n  Drone pose: [-52.29, 74.9, 20.0, -42.5, 85.42, 0.0]\n  Target bbox: [630.42, 339.04, 649.47, 380.66]\n\nFrame 5 (current):\n  Drone pose: [-52.33, 75.4, 20.0, -42.5, 85.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.89, \"ymin\": 338.72, \"xmax\": 651.98, \"ymax\": 380.96}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.24, "window_alt_abs_m": 0.0, "target_px_mean_hist": 267.0, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.98, 79.46, 20.0, -42.5, 83.56, 0.0]\n  Target bbox: [632.5, 339.44, 647.41, 380.27]\n\nFrame 2:\n  Drone pose: [-53.11, 79.98, 20.0, -42.5, 83.2, 0.0]\n  Target bbox: [628.3, 338.54, 651.57, 381.17]\n\nFrame 3:\n  Drone pose: [-53.24, 80.5, 20.0, -42.51, 82.83, 0.0]\n  Target bbox: [627.21, 339.23, 652.67, 380.49]\n\nFrame 4:\n  Drone pose: [-53.38, 81.02, 20.0, -42.51, 82.46, 0.0]\n  Target bbox: [629.39, 338.82, 650.5, 380.88]\n\nFrame 5 (current):\n  Drone pose: [-53.51, 81.54, 20.0, -42.52, 82.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.58, \"ymin\": 338.84, \"xmax\": 651.31, \"ymax\": 380.84}, \"waypoint_deltas\": [{\"dx\": -0.1, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": 1.59, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": -0.28, \"dy\": 2.13, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -2.16, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 2.67, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.45, "window_alt_abs_m": 0.0, "target_px_mean_hist": 271.2, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.81, 85.3, 20.0, -42.56, 78.5, 0.0]\n  Target bbox: [631.37, 338.68, 648.54, 381.02]\n\nFrame 2:\n  Drone pose: [-53.79, 85.84, 20.0, -42.62, 78.52, 0.0]\n  Target bbox: [629.55, 338.59, 650.47, 381.14]\n\nFrame 3:\n  Drone pose: [-53.76, 86.39, 20.0, -42.55, 77.25, 0.0]\n  Target bbox: [628.14, 338.61, 651.76, 381.13]\n\nFrame 4:\n  Drone pose: [-53.72, 86.93, 20.0, -42.62, 77.34, 0.0]\n  Target bbox: [627.42, 336.19, 652.63, 383.56]\n\nFrame 5 (current):\n  Drone pose: [-53.67, 87.48, 20.0, -42.54, 76.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.01, \"ymin\": 338.2, \"xmax\": 651.9, \"ymax\": 381.5}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.09, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": 1.63, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 2.18, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -2.09, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 2.72, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -1.95, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 268.2, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.15, 91.81, 20.0, -42.62, 73.29, 0.0]\n  Target bbox: [628.3, 337.25, 651.76, 382.49]\n\nFrame 2:\n  Drone pose: [-53.07, 92.34, 20.0, -42.5, 72.18, 0.0]\n  Target bbox: [629.47, 338.54, 650.45, 381.19]\n\nFrame 3:\n  Drone pose: [-52.98, 92.88, 20.0, -42.58, 72.39, 0.0]\n  Target bbox: [629.2, 338.02, 650.85, 381.71]\n\nFrame 4:\n  Drone pose: [-52.87, 93.41, 20.0, -42.47, 71.34, 0.0]\n  Target bbox: [628.02, 338.18, 651.9, 381.58]\n\nFrame 5 (current):\n  Drone pose: [-52.76, 93.95, 20.0, -42.56, 71.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.98, \"ymin\": 336.06, \"xmax\": 653.11, \"ymax\": 383.73}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.01, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 1.07, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.61, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -1.75, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 2.14, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.5, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": 2.68, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -2.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.64, "window_alt_abs_m": 0.0, "target_px_mean_hist": 270.2, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/ORI/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-51.86, 98.22, 20.0, -42.39, 68.54, 0.0]\n  Target bbox: [627.39, 337.9, 652.54, 381.88]\n\nFrame 2:\n  Drone pose: [-51.73, 98.75, 20.0, -42.5, 68.84, 0.0]\n  Target bbox: [627.27, 336.33, 652.83, 383.46]\n\nFrame 3:\n  Drone pose: [-51.59, 99.28, 20.0, -42.35, 67.89, 0.0]\n  Target bbox: [628.31, 337.82, 651.63, 381.9]\n\nFrame 4:\n  Drone pose: [-51.45, 99.81, 20.0, -42.46, 68.23, 0.0]\n  Target bbox: [627.73, 336.56, 652.35, 383.18]\n\nFrame 5 (current):\n  Drone pose: [-51.3, 100.34, 20.0, -42.32, 67.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.49, \"ymin\": 337.23, \"xmax\": 653.46, \"ymax\": 382.52}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": 1.04, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 1.56, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 2.07, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": 2.57, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.47, "window_alt_abs_m": 0.0, "target_px_mean_hist": 255.2, "cur_frame_id": 109, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-50.61, 45.44, 22.0, -46.87, 90.0, 0.0]\n  Target bbox: [628.63, 339.65, 651.44, 379.97]\n\nFrame 2:\n  Drone pose: [-51.72, 45.18, 21.25, -43.97, 89.5, 0.0]\n  Target bbox: [632.8, 356.69, 658.78, 403.59]\n\nFrame 3:\n  Drone pose: [-52.07, 45.17, 20.67, -43.19, 86.08, 0.0]\n  Target bbox: [628.7, 339.82, 651.18, 379.88]\n\nFrame 4:\n  Drone pose: [-52.2, 45.57, 20.64, -43.01, 85.75, 0.0]\n  Target bbox: [628.76, 338.32, 651.22, 381.45]\n\nFrame 5 (current):\n  Drone pose: [-52.07, 46.12, 20.58, -42.4, 82.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.34, \"ymin\": 340.4, \"xmax\": 651.8, \"ymax\": 379.32}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": 0.46, \"dz\": 0.01, \"dpitch\": -0.55, \"dyaw\": 2.79, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 0.98, \"dz\": -0.01, \"dpitch\": -0.53, \"dyaw\": 2.79, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 1.49, \"dz\": -0.03, \"dpitch\": -0.52, \"dyaw\": 2.78, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 2.0, \"dz\": -0.05, \"dpitch\": -0.51, \"dyaw\": 2.78, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 2.51, \"dz\": -0.16, \"dpitch\": -0.37, \"dyaw\": 2.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.08, "window_alt_abs_m": 1.42, "target_px_mean_hist": 263.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.21, 49.66, 20.36, -45.02, 82.73, 0.0]\n  Target bbox: [664.08, 301.16, 689.22, 342.24]\n\nFrame 2:\n  Drone pose: [-52.21, 50.17, 20.33, -42.68, 85.69, 0.0]\n  Target bbox: [631.66, 339.53, 648.25, 380.17]\n\nFrame 3:\n  Drone pose: [-52.21, 50.68, 20.3, -42.65, 85.69, 0.0]\n  Target bbox: [632.82, 339.96, 647.1, 379.75]\n\nFrame 4:\n  Drone pose: [-52.21, 51.19, 20.27, -41.58, 82.72, 0.0]\n  Target bbox: [665.0, 357.27, 688.54, 398.61]\n\nFrame 5 (current):\n  Drone pose: [-52.17, 51.68, 20.18, -46.93, 87.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.66, \"ymin\": 337.83, \"xmax\": 651.22, \"ymax\": 381.76}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": 0.53, \"dz\": 0.04, \"dpitch\": 4.36, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.04, \"dz\": 0.01, \"dpitch\": 4.38, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.54, \"dz\": -0.01, \"dpitch\": 4.4, \"dyaw\": -1.86, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 2.05, \"dz\": -0.03, \"dpitch\": 4.42, \"dyaw\": -1.86, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 2.56, \"dz\": -0.05, \"dpitch\": 4.43, \"dyaw\": -1.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.74, "window_alt_abs_m": 0.18, "target_px_mean_hist": 258.5, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.09, 55.71, 20.18, -42.88, 83.43, 0.0]\n  Target bbox: [626.26, 335.79, 653.79, 384.02]\n\nFrame 2:\n  Drone pose: [-52.21, 56.27, 20.08, -40.72, 90.24, 0.0]\n  Target bbox: [571.76, 370.3, 594.38, 410.43]\n\nFrame 3:\n  Drone pose: [-52.21, 56.78, 20.07, -42.68, 89.88, 0.0]\n  Target bbox: [576.13, 336.34, 599.33, 377.93]\n\nFrame 4:\n  Drone pose: [-52.17, 57.39, 20.06, -43.15, 89.12, 0.0]\n  Target bbox: [626.65, 335.8, 653.31, 383.96]\n\nFrame 5 (current):\n  Drone pose: [-52.21, 57.79, 20.05, -41.56, 89.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 584.25, \"ymin\": 354.86, \"xmax\": 608.15, \"ymax\": 395.95}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": -0.87, \"dyaw\": -3.52, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": -0.87, \"dyaw\": -3.53, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.02, \"dpitch\": -0.87, \"dyaw\": -3.53, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.02, \"dz\": -0.02, \"dpitch\": -0.87, \"dyaw\": -3.53, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.03, \"dpitch\": -0.87, \"dyaw\": -3.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.99, "window_alt_abs_m": 0.12, "target_px_mean_hist": 254.2, "cur_frame_id": 27, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.1, 61.84, 20.17, -46.49, 92.59, 0.0]\n  Target bbox: [635.06, 264.91, 649.25, 305.68]\n\nFrame 2:\n  Drone pose: [-52.22, 62.33, 20.01, -39.1, 84.61, 0.0]\n  Target bbox: [639.57, 391.78, 666.51, 440.25]\n\nFrame 3:\n  Drone pose: [-52.07, 62.76, 19.97, -42.54, 80.23, 0.0]\n  Target bbox: [619.36, 388.31, 645.79, 437.01]\n\nFrame 4:\n  Drone pose: [-52.22, 63.34, 20.01, -42.44, 85.64, 0.0]\n  Target bbox: [629.16, 339.72, 650.72, 379.99]\n\nFrame 5 (current):\n  Drone pose: [-52.22, 63.84, 20.01, -42.44, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.8, \"ymin\": 339.69, \"xmax\": 651.31, \"ymax\": 380.01}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.78, "window_alt_abs_m": 0.24, "target_px_mean_hist": 255.5, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.22, 67.36, 20.0, -44.91, 85.23, 0.0]\n  Target bbox: [635.21, 298.18, 655.0, 339.48]\n\nFrame 2:\n  Drone pose: [-52.22, 67.87, 20.0, -42.47, 85.64, 0.0]\n  Target bbox: [627.75, 339.6, 652.12, 380.11]\n\nFrame 3:\n  Drone pose: [-52.22, 68.37, 20.0, -37.47, 85.67, 0.0]\n  Target bbox: [627.87, 422.66, 651.26, 465.04]\n\nFrame 4:\n  Drone pose: [-52.22, 68.87, 20.0, -42.81, 85.82, 0.0]\n  Target bbox: [625.05, 330.75, 650.24, 377.75]\n\nFrame 5 (current):\n  Drone pose: [-52.17, 69.2, 19.95, -45.41, 92.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 574.63, \"ymin\": 329.77, \"xmax\": 605.42, \"ymax\": 377.92}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": 0.68, \"dz\": 0.05, \"dpitch\": 2.93, \"dyaw\": -7.04, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 1.18, \"dz\": 0.05, \"dpitch\": 2.93, \"dyaw\": -7.04, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 1.68, \"dz\": 0.05, \"dpitch\": 2.92, \"dyaw\": -7.04, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 2.18, \"dz\": 0.05, \"dpitch\": 2.92, \"dyaw\": -7.04, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": 2.68, \"dz\": 0.05, \"dpitch\": 2.92, \"dyaw\": -7.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.45, "window_alt_abs_m": 0.06, "target_px_mean_hist": 261.5, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.24, 73.39, 20.0, -42.5, 85.57, 0.0]\n  Target bbox: [633.55, 339.24, 646.37, 380.46]\n\nFrame 2:\n  Drone pose: [-52.25, 73.89, 20.0, -42.5, 85.54, 0.0]\n  Target bbox: [632.33, 339.38, 647.59, 380.33]\n\nFrame 3:\n  Drone pose: [-52.27, 74.54, 19.95, -47.18, 83.83, 0.0]\n  Target bbox: [627.07, 337.04, 652.81, 382.51]\n\nFrame 4:\n  Drone pose: [-52.29, 74.9, 20.0, -41.37, 85.89, 0.0]\n  Target bbox: [623.47, 358.3, 644.68, 399.29]\n\nFrame 5 (current):\n  Drone pose: [-52.33, 75.4, 20.0, -43.03, 90.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 570.62, \"ymin\": 332.42, \"xmax\": 585.44, \"ymax\": 373.17}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -5.12, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -5.26, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -5.43, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -5.63, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -5.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.24, "window_alt_abs_m": 0.09, "target_px_mean_hist": 275.5, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.98, 79.46, 20.0, -41.87, 84.45, 0.0]\n  Target bbox: [617.84, 349.74, 639.68, 391.13]\n\nFrame 2:\n  Drone pose: [-53.11, 79.98, 20.0, -44.29, 78.2, 0.0]\n  Target bbox: [689.87, 309.79, 714.23, 353.77]\n\nFrame 3:\n  Drone pose: [-53.12, 80.39, 19.98, -44.52, 76.39, 0.0]\n  Target bbox: [626.48, 335.3, 653.57, 384.43]\n\nFrame 4:\n  Drone pose: [-53.38, 81.02, 20.0, -45.37, 85.12, 0.0]\n  Target bbox: [595.45, 290.33, 618.41, 334.56]\n\nFrame 5 (current):\n  Drone pose: [-53.57, 81.56, 19.91, -44.85, 77.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.69, \"ymin\": 341.51, \"xmax\": 650.15, \"ymax\": 388.07}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": 0.51, \"dz\": 0.09, \"dpitch\": 2.32, \"dyaw\": 4.5, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 1.04, \"dz\": 0.09, \"dpitch\": 2.3, \"dyaw\": 4.27, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.57, \"dz\": 0.09, \"dpitch\": 2.26, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": -0.22, \"dy\": 2.11, \"dz\": 0.09, \"dpitch\": 2.32, \"dyaw\": 2.65, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 2.65, \"dz\": 0.09, \"dpitch\": 2.27, \"dyaw\": 2.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.61, "window_alt_abs_m": 0.13, "target_px_mean_hist": 259.2, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.81, 85.3, 20.0, -42.56, 78.5, 0.0]\n  Target bbox: [627.21, 338.2, 652.69, 381.5]\n\nFrame 2:\n  Drone pose: [-53.79, 85.84, 20.0, -42.62, 78.52, 0.0]\n  Target bbox: [627.4, 337.09, 652.64, 382.7]\n\nFrame 3:\n  Drone pose: [-53.76, 86.39, 20.0, -42.55, 77.25, 0.0]\n  Target bbox: [628.1, 338.3, 651.8, 381.39]\n\nFrame 4:\n  Drone pose: [-53.72, 86.93, 20.0, -44.89, 78.56, 0.0]\n  Target bbox: [611.54, 297.99, 637.9, 345.83]\n\nFrame 5 (current):\n  Drone pose: [-53.67, 87.48, 20.0, -38.43, 75.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 635.32, \"ymin\": 407.31, \"xmax\": 662.16, \"ymax\": 450.69}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": -4.19, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.09, \"dz\": 0.0, \"dpitch\": -4.11, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": 1.63, \"dz\": 0.0, \"dpitch\": -4.19, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 2.18, \"dz\": 0.0, \"dpitch\": -4.09, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 2.72, \"dz\": 0.0, \"dpitch\": -4.17, \"dyaw\": -1.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.75, "window_alt_abs_m": 0.0, "target_px_mean_hist": 264.5, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.15, 91.81, 20.0, -42.62, 73.29, 0.0]\n  Target bbox: [627.85, 336.75, 652.22, 383.0]\n\nFrame 2:\n  Drone pose: [-53.07, 92.34, 20.0, -43.33, 75.33, 0.0]\n  Target bbox: [590.14, 325.44, 611.43, 367.81]\n\nFrame 3:\n  Drone pose: [-52.98, 92.88, 20.0, -47.58, 70.64, 0.0]\n  Target bbox: [648.82, 253.71, 675.12, 298.44]\n\nFrame 4:\n  Drone pose: [-52.8, 93.3, 19.98, -39.88, 70.77, 0.0]\n  Target bbox: [629.25, 338.61, 650.83, 381.18]\n\nFrame 5 (current):\n  Drone pose: [-52.76, 93.95, 20.0, -43.11, 76.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 566.43, \"ymin\": 330.79, \"xmax\": 589.76, \"ymax\": 374.0}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.67, \"dyaw\": -6.01, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 1.07, \"dz\": 0.0, \"dpitch\": 0.58, \"dyaw\": -5.74, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.61, \"dz\": 0.0, \"dpitch\": 0.7, \"dyaw\": -6.75, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 2.14, \"dz\": 0.0, \"dpitch\": 0.61, \"dyaw\": -6.5, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": 2.68, \"dz\": 0.0, \"dpitch\": 0.75, \"dyaw\": -7.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.71, "window_alt_abs_m": 0.04, "target_px_mean_hist": 255.2, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403/aug_001/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-51.86, 98.22, 20.0, -42.39, 68.54, 0.0]\n  Target bbox: [629.2, 338.23, 650.89, 381.46]\n\nFrame 2:\n  Drone pose: [-51.83, 98.78, 19.97, -40.59, 66.23, 0.0]\n  Target bbox: [687.21, 372.54, 716.57, 420.58]\n\nFrame 3:\n  Drone pose: [-51.59, 99.28, 20.0, -38.47, 62.89, 0.0]\n  Target bbox: [688.4, 404.08, 716.13, 449.82]\n\nFrame 4:\n  Drone pose: [-51.45, 99.81, 20.0, -45.9, 72.28, 0.0]\n  Target bbox: [577.87, 281.13, 601.37, 325.58]\n\nFrame 5 (current):\n  Drone pose: [-51.44, 100.46, 20.06, -45.39, 62.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.75, \"ymin\": 335.58, \"xmax\": 654.24, \"ymax\": 384.15}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.4, \"dz\": -0.06, \"dpitch\": 2.96, \"dyaw\": 5.08, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 0.92, \"dz\": -0.06, \"dpitch\": 3.11, \"dyaw\": 4.23, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": 1.44, \"dz\": -0.06, \"dpitch\": 3.0, \"dyaw\": 4.65, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": 1.95, \"dz\": -0.06, \"dpitch\": 3.15, \"dyaw\": 3.87, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": 2.45, \"dz\": -0.06, \"dpitch\": 3.03, \"dyaw\": 4.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.7, "window_alt_abs_m": 0.13, "target_px_mean_hist": 268.5, "cur_frame_id": 109, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260403", "difficulty_score": 0.295, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [67.39, -60.06, 22.0, -46.92, 180.0, 0.0]\n  Target bbox: [631.05, 341.77, 648.95, 377.89]\n\nFrame 2:\n  Drone pose: [65.63, -61.53, 21.2, -47.6, 175.51, 0.0]\n  Target bbox: [629.93, 339.28, 649.95, 380.37]\n\nFrame 3:\n  Drone pose: [64.51, -62.26, 20.67, -47.7, 173.07, 0.0]\n  Target bbox: [629.15, 337.72, 650.71, 381.95]\n\nFrame 4:\n  Drone pose: [63.73, -62.57, 20.64, -48.03, 171.97, 0.0]\n  Target bbox: [629.08, 337.93, 650.79, 381.72]\n\nFrame 5 (current):\n  Drone pose: [63.11, -62.7, 20.62, -48.15, 171.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.98, \"ymin\": 338.69, \"xmax\": 650.91, \"ymax\": 380.94}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": -0.03, \"dz\": -0.03, \"dpitch\": -0.03, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.04, \"dz\": -0.05, \"dpitch\": -0.02, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": -0.05, \"dz\": -0.07, \"dpitch\": -0.01, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.05, \"dz\": -0.09, \"dpitch\": 0.0, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": -0.05, \"dz\": -0.2, \"dpitch\": 0.14, \"dyaw\": -0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.48, "window_alt_abs_m": 1.38, "target_px_mean_hist": 231.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [60.51, -62.75, 20.42, -48.01, 171.29, 0.0]\n  Target bbox: [629.35, 337.53, 650.51, 382.14]\n\nFrame 2:\n  Drone pose: [60.0, -62.75, 20.39, -47.98, 171.29, 0.0]\n  Target bbox: [628.83, 337.97, 651.04, 381.65]\n\nFrame 3:\n  Drone pose: [59.49, -62.76, 20.36, -47.96, 171.28, 0.0]\n  Target bbox: [629.52, 337.32, 650.35, 382.32]\n\nFrame 4:\n  Drone pose: [58.98, -62.76, 20.33, -47.93, 171.27, 0.0]\n  Target bbox: [629.2, 338.93, 650.7, 380.67]\n\nFrame 5 (current):\n  Drone pose: [58.47, -62.76, 20.3, -47.91, 171.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.43, \"ymin\": 338.32, \"xmax\": 650.46, \"ymax\": 381.31}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.0, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": 0.01, \"dz\": -0.11, \"dpitch\": 0.07, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": 0.03, \"dz\": -0.13, \"dpitch\": 0.06, \"dyaw\": 0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.04, "window_alt_abs_m": 0.12, "target_px_mean_hist": 247.2, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.36, -62.71, 20.15, -47.87, 171.35, 0.0]\n  Target bbox: [628.24, 338.42, 651.64, 381.22]\n\nFrame 2:\n  Drone pose: [54.84, -62.69, 20.14, -47.88, 171.42, 0.0]\n  Target bbox: [628.17, 337.94, 651.69, 381.7]\n\nFrame 3:\n  Drone pose: [54.32, -62.66, 20.12, -47.9, 171.51, 0.0]\n  Target bbox: [629.21, 338.01, 650.67, 381.61]\n\nFrame 4:\n  Drone pose: [53.8, -62.63, 20.1, -47.92, 171.6, 0.0]\n  Target bbox: [628.96, 336.8, 650.89, 382.84]\n\nFrame 5 (current):\n  Drone pose: [53.29, -62.59, 20.09, -47.93, 171.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.75, \"ymin\": 337.87, \"xmax\": 651.13, \"ymax\": 381.73}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.02, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.04, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.03, \"dz\": -0.03, \"dpitch\": 0.09, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.01, \"dz\": -0.04, \"dpitch\": 0.16, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 0.25, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.35, "window_alt_abs_m": 0.06, "target_px_mean_hist": 258.2, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.43, -62.65, 20.04, -47.6, 171.58, 0.0]\n  Target bbox: [628.84, 336.96, 651.01, 382.68]\n\nFrame 2:\n  Drone pose: [49.97, -62.7, 20.03, -47.53, 171.45, 0.0]\n  Target bbox: [628.74, 337.75, 651.14, 381.86]\n\nFrame 3:\n  Drone pose: [49.49, -62.75, 20.03, -47.48, 171.31, 0.0]\n  Target bbox: [628.82, 338.79, 651.08, 380.8]\n\nFrame 4:\n  Drone pose: [49.01, -62.79, 20.02, -47.43, 171.17, 0.0]\n  Target bbox: [628.88, 338.86, 651.02, 380.74]\n\nFrame 5 (current):\n  Drone pose: [48.51, -62.83, 20.02, -47.41, 171.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.67, \"ymin\": 338.05, \"xmax\": 651.22, \"ymax\": 381.55}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": -0.06, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -0.08, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -0.08, \"dz\": -0.01, \"dpitch\": -0.1, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": -0.07, \"dz\": -0.01, \"dpitch\": -0.14, \"dyaw\": -0.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.53, "window_alt_abs_m": 0.02, "target_px_mean_hist": 255.5, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [45.9, -62.9, 20.01, -47.55, 170.76, 0.0]\n  Target bbox: [628.88, 336.68, 650.97, 382.97]\n\nFrame 2:\n  Drone pose: [45.38, -62.89, 20.01, -47.59, 170.8, 0.0]\n  Target bbox: [629.25, 336.85, 650.62, 382.77]\n\nFrame 3:\n  Drone pose: [44.86, -62.86, 20.0, -47.62, 170.88, 0.0]\n  Target bbox: [629.08, 338.12, 650.8, 381.5]\n\nFrame 4:\n  Drone pose: [44.35, -62.82, 20.0, -47.65, 170.99, 0.0]\n  Target bbox: [628.24, 336.65, 651.61, 383.01]\n\nFrame 5 (current):\n  Drone pose: [43.84, -62.78, 20.0, -47.66, 171.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.16, \"ymin\": 336.92, \"xmax\": 651.69, \"ymax\": 382.75}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 0.17, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -1.96, \"dy\": 0.25, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -2.43, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 1.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.37, "window_alt_abs_m": 0.0, "target_px_mean_hist": 256.8, "cur_frame_id": 42, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [40.93, -62.39, 20.0, -47.62, 172.42, 0.0]\n  Target bbox: [629.48, 338.14, 650.4, 381.49]\n\nFrame 2:\n  Drone pose: [40.44, -62.33, 20.0, -47.61, 172.62, 0.0]\n  Target bbox: [628.45, 338.63, 651.44, 380.97]\n\nFrame 3:\n  Drone pose: [39.93, -62.29, 20.0, -47.63, 172.76, 0.0]\n  Target bbox: [629.46, 337.25, 650.4, 382.41]\n\nFrame 4:\n  Drone pose: [39.41, -62.26, 20.0, -47.68, 172.84, 0.0]\n  Target bbox: [629.28, 338.63, 650.61, 380.99]\n\nFrame 5 (current):\n  Drone pose: [38.87, -62.24, 20.0, -47.74, 172.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.43, \"ymin\": 337.24, \"xmax\": 650.43, \"ymax\": 382.42}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -1.65, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -2.22, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": -0.45, \"dyaw\": 1.63, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -0.61, \"dyaw\": 1.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 256.5, "cur_frame_id": 52, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.45, -62.15, 20.0, -48.6, 176.34, 0.0]\n  Target bbox: [629.88, 337.31, 649.98, 382.31]\n\nFrame 2:\n  Drone pose: [34.84, -62.09, 20.0, -48.78, 176.5, 0.0]\n  Target bbox: [625.96, 335.35, 654.1, 384.31]\n\nFrame 3:\n  Drone pose: [34.24, -62.03, 20.0, -48.99, 178.4, 0.0]\n  Target bbox: [630.24, 337.57, 649.62, 382.03]\n\nFrame 4:\n  Drone pose: [33.65, -61.96, 20.0, -49.14, 178.6, 0.0]\n  Target bbox: [630.05, 337.48, 649.8, 382.13]\n\nFrame 5 (current):\n  Drone pose: [33.08, -61.91, 20.0, -49.27, 178.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.91, \"ymin\": 335.45, \"xmax\": 654.07, \"ymax\": 384.19}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 1.88, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 1.98, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 3.75, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": 3.73, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": 5.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 266.5, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [30.43, -61.85, 20.0, -49.46, -175.82, 0.0]\n  Target bbox: [628.6, 338.2, 651.54, 381.38]\n\nFrame 2:\n  Drone pose: [29.91, -61.88, 20.0, -49.49, -175.92, 0.0]\n  Target bbox: [624.9, 334.57, 655.05, 385.1]\n\nFrame 3:\n  Drone pose: [29.39, -61.91, 20.0, -49.46, -174.32, 0.0]\n  Target bbox: [629.59, 337.01, 650.56, 382.59]\n\nFrame 4:\n  Drone pose: [28.87, -61.96, 20.0, -49.5, -174.46, 0.0]\n  Target bbox: [626.76, 335.98, 653.19, 383.65]\n\nFrame 5 (current):\n  Drone pose: [28.34, -62.0, 20.0, -49.47, -172.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.15, \"ymin\": 336.44, \"xmax\": 650.99, \"ymax\": 383.14}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -2.11, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": -2.63, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 1.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.41, "window_alt_abs_m": 0.0, "target_px_mean_hist": 264.2, "cur_frame_id": 71, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.19, -62.58, 20.0, -49.62, -171.37, 0.0]\n  Target bbox: [624.99, 334.48, 654.93, 385.19]\n\nFrame 2:\n  Drone pose: [24.67, -62.73, 20.0, -49.56, -170.14, 0.0]\n  Target bbox: [627.81, 336.27, 652.35, 383.35]\n\nFrame 3:\n  Drone pose: [24.15, -62.87, 20.0, -49.63, -170.62, 0.0]\n  Target bbox: [625.12, 335.18, 654.87, 384.44]\n\nFrame 4:\n  Drone pose: [23.63, -63.01, 20.0, -49.56, -169.37, 0.0]\n  Target bbox: [628.59, 336.09, 651.55, 383.49]\n\nFrame 5 (current):\n  Drone pose: [23.11, -63.14, 20.0, -49.64, -169.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.25, \"ymin\": 336.34, \"xmax\": 652.71, \"ymax\": 383.25}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -0.24, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.91, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.36, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 1.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.4, "window_alt_abs_m": 0.0, "target_px_mean_hist": 266.8, "cur_frame_id": 81, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/ORI/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [19.92, -63.88, 20.0, -49.7, -167.06, 0.0]\n  Target bbox: [628.19, 336.09, 651.95, 383.52]\n\nFrame 2:\n  Drone pose: [19.38, -64.01, 20.0, -49.8, -167.51, 0.0]\n  Target bbox: [625.04, 335.16, 654.93, 384.46]\n\nFrame 3:\n  Drone pose: [18.85, -64.15, 20.0, -49.71, -166.26, 0.0]\n  Target bbox: [627.77, 337.27, 652.33, 382.28]\n\nFrame 4:\n  Drone pose: [18.33, -64.3, 20.0, -49.82, -166.74, 0.0]\n  Target bbox: [627.18, 335.98, 652.75, 383.62]\n\nFrame 5 (current):\n  Drone pose: [17.89, -64.37, 20.0, -49.53, -165.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.29, \"ymin\": 337.16, \"xmax\": 651.82, \"ymax\": 382.42}, \"waypoint_deltas\": [{\"dx\": -0.42, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.63, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.91, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 1.19, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -1.75, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 1.7, \"dyaw\": 2.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.57, "window_alt_abs_m": 0.0, "target_px_mean_hist": 264.0, "cur_frame_id": 91, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [67.26, -60.09, 22.05, -42.53, 181.77, 0.0]\n  Target bbox: [610.3, 418.43, 627.09, 457.9]\n\nFrame 2:\n  Drone pose: [65.63, -61.5, 21.17, -50.13, 180.59, 0.0]\n  Target bbox: [573.27, 298.82, 593.1, 338.61]\n\nFrame 3:\n  Drone pose: [64.51, -62.26, 20.67, -52.7, 173.52, 0.0]\n  Target bbox: [623.89, 253.72, 645.55, 297.89]\n\nFrame 4:\n  Drone pose: [63.73, -62.57, 20.64, -45.75, 176.84, 0.0]\n  Target bbox: [573.94, 378.54, 596.33, 421.17]\n\nFrame 5 (current):\n  Drone pose: [63.12, -62.82, 20.69, -48.96, 167.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 669.66, \"ymin\": 327.42, \"xmax\": 692.8, \"ymax\": 369.0}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.09, \"dz\": -0.1, \"dpitch\": 0.78, \"dyaw\": 3.92, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.08, \"dz\": -0.12, \"dpitch\": 0.79, \"dyaw\": 3.87, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.07, \"dz\": -0.14, \"dpitch\": 0.8, \"dyaw\": 3.85, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": 0.07, \"dz\": -0.16, \"dpitch\": 0.81, \"dyaw\": 3.84, \"droll\": 0.0}, {\"dx\": -2.61, \"dy\": 0.07, \"dz\": -0.27, \"dpitch\": 0.95, \"dyaw\": 3.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.94, "window_alt_abs_m": 1.45, "target_px_mean_hist": 232.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [60.51, -62.75, 20.42, -49.21, 174.37, 0.0]\n  Target bbox: [593.5, 319.3, 617.04, 361.57]\n\nFrame 2:\n  Drone pose: [60.06, -62.73, 20.46, -44.66, 166.45, 0.0]\n  Target bbox: [683.59, 394.84, 707.81, 440.53]\n\nFrame 3:\n  Drone pose: [59.4, -62.84, 20.21, -47.86, 170.95, 0.0]\n  Target bbox: [628.66, 339.25, 651.25, 380.34]\n\nFrame 4:\n  Drone pose: [59.07, -62.9, 20.37, -46.81, 174.32, 0.0]\n  Target bbox: [590.22, 356.4, 611.64, 398.66]\n\nFrame 5 (current):\n  Drone pose: [58.47, -62.76, 20.3, -45.78, 174.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 587.33, \"ymin\": 375.8, \"xmax\": 610.88, \"ymax\": 417.06}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": -2.1, \"dyaw\": -3.63, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.0, \"dz\": -0.06, \"dpitch\": -2.08, \"dyaw\": -3.63, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": -2.07, \"dyaw\": -3.63, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": 0.01, \"dz\": -0.11, \"dpitch\": -2.06, \"dyaw\": -3.61, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": 0.03, \"dz\": -0.13, \"dpitch\": -2.07, \"dyaw\": -3.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.34, "window_alt_abs_m": 0.52, "target_px_mean_hist": 254.2, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.36, -62.71, 20.15, -47.87, 171.35, 0.0]\n  Target bbox: [629.2, 337.41, 650.67, 382.24]\n\nFrame 2:\n  Drone pose: [54.81, -62.8, 20.07, -47.81, 171.06, 0.0]\n  Target bbox: [628.66, 338.03, 651.22, 381.59]\n\nFrame 3:\n  Drone pose: [54.32, -62.66, 20.12, -47.9, 171.51, 0.0]\n  Target bbox: [628.72, 337.4, 651.14, 382.24]\n\nFrame 4:\n  Drone pose: [53.79, -62.63, 20.05, -50.57, 172.13, 0.0]\n  Target bbox: [622.1, 291.55, 645.43, 337.25]\n\nFrame 5 (current):\n  Drone pose: [53.29, -62.59, 20.09, -52.93, 175.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 585.15, \"ymin\": 255.78, \"xmax\": 607.33, \"ymax\": 297.93}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.02, \"dz\": -0.01, \"dpitch\": 5.01, \"dyaw\": -3.77, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.04, \"dz\": -0.02, \"dpitch\": 5.03, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.03, \"dz\": -0.03, \"dpitch\": 5.09, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.01, \"dz\": -0.04, \"dpitch\": 5.16, \"dyaw\": -3.78, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 5.25, \"dyaw\": -3.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.8, "window_alt_abs_m": 0.25, "target_px_mean_hist": 255.0, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.49, -62.59, 20.02, -48.69, 173.39, 0.0]\n  Target bbox: [611.17, 317.29, 632.4, 362.88]\n\nFrame 2:\n  Drone pose: [49.96, -62.62, 19.99, -48.32, 176.68, 0.0]\n  Target bbox: [571.68, 325.6, 594.53, 370.21]\n\nFrame 3:\n  Drone pose: [49.49, -62.75, 20.03, -47.48, 171.31, 0.0]\n  Target bbox: [628.71, 338.62, 651.18, 380.99]\n\nFrame 4:\n  Drone pose: [49.01, -62.79, 20.02, -50.65, 176.17, 0.0]\n  Target bbox: [572.01, 285.22, 593.92, 330.05]\n\nFrame 5 (current):\n  Drone pose: [48.6, -62.7, 19.97, -47.23, 171.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.76, \"ymin\": 337.86, \"xmax\": 651.12, \"ymax\": 381.78}, \"waypoint_deltas\": [{\"dx\": -0.6, \"dy\": -0.17, \"dz\": 0.05, \"dpitch\": -0.18, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -0.19, \"dz\": 0.04, \"dpitch\": -0.2, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": -0.21, \"dz\": 0.04, \"dpitch\": -0.23, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": -0.21, \"dz\": 0.04, \"dpitch\": -0.28, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": -0.2, \"dz\": 0.04, \"dpitch\": -0.32, \"dyaw\": -0.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.18, "window_alt_abs_m": 0.13, "target_px_mean_hist": 255.5, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [45.9, -62.9, 20.01, -47.55, 170.76, 0.0]\n  Target bbox: [628.11, 337.0, 651.74, 382.67]\n\nFrame 2:\n  Drone pose: [45.38, -62.85, 20.04, -47.64, 170.91, 0.0]\n  Target bbox: [629.13, 337.96, 650.75, 381.67]\n\nFrame 3:\n  Drone pose: [44.84, -63.04, 20.06, -51.43, 171.64, 0.0]\n  Target bbox: [613.39, 273.9, 635.82, 320.06]\n\nFrame 4:\n  Drone pose: [44.46, -62.79, 20.03, -47.98, 166.87, 0.0]\n  Target bbox: [676.5, 329.98, 700.64, 376.84]\n\nFrame 5 (current):\n  Drone pose: [43.87, -62.81, 19.9, -49.06, 176.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 570.91, \"ymin\": 314.24, \"xmax\": 595.19, \"ymax\": 355.41}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.08, \"dz\": 0.1, \"dpitch\": 1.39, \"dyaw\": -4.75, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.14, \"dz\": 0.1, \"dpitch\": 1.38, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": 0.2, \"dz\": 0.1, \"dpitch\": 1.39, \"dyaw\": -4.35, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.28, \"dz\": 0.1, \"dpitch\": 1.4, \"dyaw\": -4.1, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": 0.35, \"dz\": 0.1, \"dpitch\": 1.42, \"dyaw\": -3.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.83, "window_alt_abs_m": 0.21, "target_px_mean_hist": 256.0, "cur_frame_id": 42, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [40.93, -62.39, 20.0, -46.79, 171.44, 0.0]\n  Target bbox: [639.97, 350.83, 662.54, 396.84]\n\nFrame 2:\n  Drone pose: [40.37, -62.3, 19.94, -49.74, 167.69, 0.0]\n  Target bbox: [684.36, 303.99, 708.98, 348.58]\n\nFrame 3:\n  Drone pose: [39.91, -62.3, 19.9, -46.39, 167.71, 0.0]\n  Target bbox: [684.98, 358.44, 708.51, 402.63]\n\nFrame 4:\n  Drone pose: [39.45, -62.13, 19.88, -47.46, 173.28, 0.0]\n  Target bbox: [629.51, 337.23, 650.35, 382.42]\n\nFrame 5 (current):\n  Drone pose: [38.87, -62.24, 20.0, -51.84, 169.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 668.52, \"ymin\": 268.47, \"xmax\": 693.23, \"ymax\": 315.24}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 4.03, \"dyaw\": 3.6, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 3.96, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": -1.65, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 3.86, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": -2.22, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 3.65, \"dyaw\": 5.24, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 3.49, \"dyaw\": 5.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.36, "window_alt_abs_m": 0.24, "target_px_mean_hist": 261.5, "cur_frame_id": 52, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.47, -62.03, 20.03, -49.82, 179.73, 0.0]\n  Target bbox: [596.78, 318.01, 616.53, 362.21]\n\nFrame 2:\n  Drone pose: [34.69, -62.06, 19.9, -47.83, 171.71, 0.0]\n  Target bbox: [679.03, 355.05, 708.4, 403.4]\n\nFrame 3:\n  Drone pose: [34.27, -61.92, 20.0, -48.95, 178.77, 0.0]\n  Target bbox: [629.61, 338.03, 650.25, 381.53]\n\nFrame 4:\n  Drone pose: [33.65, -61.96, 20.0, -53.72, 173.6, 0.0]\n  Target bbox: [683.59, 261.56, 706.66, 307.89]\n\nFrame 5 (current):\n  Drone pose: [33.08, -61.91, 20.0, -50.33, 177.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 634.87, \"ymin\": 316.89, \"xmax\": 664.6, \"ymax\": 367.31}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.96, \"dyaw\": 2.75, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": 0.9, \"dyaw\": 2.85, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 0.88, \"dyaw\": 4.62, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": 0.85, \"dyaw\": 4.6, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.87, \"dyaw\": 6.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.56, "window_alt_abs_m": 0.23, "target_px_mean_hist": 270.5, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [30.52, -61.87, 19.94, -49.22, -175.91, 0.0]\n  Target bbox: [629.42, 338.98, 650.7, 380.58]\n\nFrame 2:\n  Drone pose: [29.88, -61.94, 20.01, -54.07, -177.12, 0.0]\n  Target bbox: [636.3, 259.14, 665.63, 309.09]\n\nFrame 3:\n  Drone pose: [29.29, -62.01, 20.04, -54.71, -174.33, 0.0]\n  Target bbox: [626.09, 253.59, 647.52, 297.93]\n\nFrame 4:\n  Drone pose: [28.86, -61.86, 19.87, -48.02, -170.12, 0.0]\n  Target bbox: [579.91, 356.21, 612.03, 408.72]\n\nFrame 5 (current):\n  Drone pose: [28.26, -61.96, 20.04, -53.08, -167.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 573.3, \"ymin\": 280.91, \"xmax\": 597.69, \"ymax\": 327.17}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": -0.1, \"dz\": -0.04, \"dpitch\": 3.55, \"dyaw\": -5.37, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.17, \"dz\": -0.04, \"dpitch\": 3.49, \"dyaw\": -5.6, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -0.26, \"dz\": -0.04, \"dpitch\": 3.53, \"dyaw\": -4.16, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.36, \"dz\": -0.04, \"dpitch\": 3.46, \"dyaw\": -4.51, \"droll\": 0.0}, {\"dx\": -2.55, \"dy\": -0.49, \"dz\": -0.04, \"dpitch\": 3.52, \"dyaw\": -3.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.62, "window_alt_abs_m": 0.44, "target_px_mean_hist": 269.0, "cur_frame_id": 71, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.02, -62.59, 19.97, -54.04, -166.29, 0.0]\n  Target bbox: [570.55, 267.21, 600.56, 315.84]\n\nFrame 2:\n  Drone pose: [24.53, -62.67, 20.02, -47.34, -164.88, 0.0]\n  Target bbox: [572.87, 378.61, 598.56, 427.07]\n\nFrame 3:\n  Drone pose: [24.3, -62.92, 19.99, -49.31, -174.94, 0.0]\n  Target bbox: [668.64, 336.9, 700.26, 387.51]\n\nFrame 4:\n  Drone pose: [23.53, -62.96, 20.1, -48.69, -174.15, 0.0]\n  Target bbox: [682.63, 358.02, 706.02, 404.24]\n\nFrame 5 (current):\n  Drone pose: [23.08, -62.97, 19.96, -44.56, -171.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 654.85, \"ymin\": 420.2, \"xmax\": 683.18, \"ymax\": 468.5}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.29, \"dz\": 0.04, \"dpitch\": -5.0, \"dyaw\": 3.33, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.41, \"dz\": 0.04, \"dpitch\": -5.09, \"dyaw\": 2.94, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": -0.53, \"dz\": 0.04, \"dpitch\": -5.18, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.65, \"dz\": 0.04, \"dpitch\": -5.11, \"dyaw\": 3.88, \"droll\": 0.0}, {\"dx\": -2.63, \"dy\": -0.78, \"dz\": 0.04, \"dpitch\": -5.21, \"dyaw\": 3.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.58, "window_alt_abs_m": 0.33, "target_px_mean_hist": 270.0, "cur_frame_id": 81, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451/aug_001/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [19.96, -63.78, 20.05, -49.65, -166.8, 0.0]\n  Target bbox: [628.1, 338.04, 651.99, 381.51]\n\nFrame 2:\n  Drone pose: [19.38, -64.01, 20.0, -53.93, -162.51, 0.0]\n  Target bbox: [569.81, 268.01, 601.13, 316.53]\n\nFrame 3:\n  Drone pose: [18.85, -64.15, 20.0, -52.61, -161.54, 0.0]\n  Target bbox: [574.76, 288.43, 602.29, 337.14]\n\nFrame 4:\n  Drone pose: [18.33, -64.3, 20.0, -49.82, -166.74, 0.0]\n  Target bbox: [625.44, 335.31, 654.52, 384.29]\n\nFrame 5 (current):\n  Drone pose: [17.89, -64.37, 20.0, -49.53, -165.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.05, \"ymin\": 336.51, \"xmax\": 653.08, \"ymax\": 383.11}, \"waypoint_deltas\": [{\"dx\": -0.42, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.63, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.91, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 1.19, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -1.75, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 1.7, \"dyaw\": 2.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.87, "window_alt_abs_m": 0.05, "target_px_mean_hist": 267.8, "cur_frame_id": 91, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776209451", "difficulty_score": 0.2819, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [32.44, -67.02, 22.0, -54.1, 1.73, 0.0]\n  Target bbox: [625.58, 335.27, 654.4, 384.24]\n\nFrame 2:\n  Drone pose: [32.67, -67.06, 21.34, -52.71, 3.67, 0.0]\n  Target bbox: [630.36, 337.93, 649.76, 381.59]\n\nFrame 3:\n  Drone pose: [32.9, -67.1, 21.35, -52.25, 3.75, 0.0]\n  Target bbox: [625.46, 335.02, 654.52, 384.54]\n\nFrame 4:\n  Drone pose: [33.13, -67.14, 21.36, -51.73, 5.57, 0.0]\n  Target bbox: [625.66, 335.16, 654.32, 384.4]\n\nFrame 5 (current):\n  Drone pose: [33.36, -67.18, 21.38, -51.2, 7.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.44, \"ymin\": 335.9, \"xmax\": 649.69, \"ymax\": 383.61}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": -0.04, \"dz\": 0.02, \"dpitch\": 0.42, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -0.08, \"dz\": 0.05, \"dpitch\": 0.83, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -0.12, \"dz\": 0.09, \"dpitch\": 1.23, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.16, \"dz\": 0.13, \"dpitch\": 1.61, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -0.2, \"dz\": 0.15, \"dpitch\": 2.02, \"dyaw\": 0.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.6, "window_alt_abs_m": 0.7, "target_px_mean_hist": 276.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.2, -67.5, 21.79, -48.27, 7.46, 0.0]\n  Target bbox: [630.45, 336.6, 649.69, 383.06]\n\nFrame 2:\n  Drone pose: [35.43, -67.54, 21.9, -48.02, 7.47, 0.0]\n  Target bbox: [630.75, 337.29, 649.39, 382.41]\n\nFrame 3:\n  Drone pose: [35.66, -67.58, 22.03, -47.78, 7.49, 0.0]\n  Target bbox: [630.67, 336.98, 649.47, 382.69]\n\nFrame 4:\n  Drone pose: [35.89, -67.62, 22.17, -47.57, 7.5, 0.0]\n  Target bbox: [631.27, 337.4, 648.85, 382.26]\n\nFrame 5 (current):\n  Drone pose: [36.12, -67.66, 22.32, -47.37, 7.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.25, \"ymin\": 341.32, \"xmax\": 648.81, \"ymax\": 378.32}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": -0.04, \"dz\": 0.17, \"dpitch\": 0.16, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -0.08, \"dz\": 0.34, \"dpitch\": 0.32, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -0.12, \"dz\": 0.53, \"dpitch\": 0.45, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.16, \"dz\": 0.72, \"dpitch\": 0.58, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": -0.2, \"dz\": 0.95, \"dpitch\": 0.33, \"dyaw\": 0.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.05, "window_alt_abs_m": 0.53, "target_px_mean_hist": 251.2, "cur_frame_id": 16, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [39.14, -67.61, 24.02, -48.18, 7.01, 0.0]\n  Target bbox: [631.65, 339.37, 648.46, 380.37]\n\nFrame 2:\n  Drone pose: [39.72, -67.23, 24.28, -48.67, 6.0, 0.0]\n  Target bbox: [631.95, 342.86, 648.11, 376.81]\n\nFrame 3:\n  Drone pose: [40.58, -66.97, 24.47, -49.42, 5.38, 0.0]\n  Target bbox: [631.82, 339.31, 648.29, 380.38]\n\nFrame 4:\n  Drone pose: [41.46, -67.04, 24.66, -50.16, 5.69, 0.0]\n  Target bbox: [631.8, 339.62, 648.31, 380.08]\n\nFrame 5 (current):\n  Drone pose: [42.34, -66.83, 24.85, -50.95, 5.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 632.19, \"ymin\": 339.73, \"xmax\": 647.9, \"ymax\": 379.92}, \"waypoint_deltas\": [{\"dx\": 1.1, \"dy\": -1.72, \"dz\": 0.7, \"dpitch\": -1.3, \"dyaw\": 5.25, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": -3.16, \"dz\": 1.0, \"dpitch\": -1.64, \"dyaw\": 9.64, \"droll\": 0.0}, {\"dx\": 2.63, \"dy\": -4.05, \"dz\": 1.13, \"dpitch\": -1.5, \"dyaw\": 13.76, \"droll\": 0.0}, {\"dx\": 3.3, \"dy\": -4.21, \"dz\": 1.63, \"dpitch\": -1.93, \"dyaw\": 15.74, \"droll\": 0.0}, {\"dx\": 3.96, \"dy\": -4.3, \"dz\": 1.67, \"dpitch\": -1.87, \"dyaw\": 17.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.43, "window_alt_abs_m": 0.83, "target_px_mean_hist": 213.2, "cur_frame_id": 28, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [48.27, -70.84, 25.96, -52.11, 26.51, 0.0]\n  Target bbox: [629.35, 340.85, 650.58, 378.83]\n\nFrame 2:\n  Drone pose: [48.79, -69.15, 25.77, -52.64, 23.32, 0.0]\n  Target bbox: [631.13, 341.43, 648.81, 378.24]\n\nFrame 3:\n  Drone pose: [49.05, -67.02, 24.97, -52.27, 18.48, 0.0]\n  Target bbox: [628.27, 339.24, 651.68, 380.42]\n\nFrame 4:\n  Drone pose: [49.61, -67.02, 24.61, -51.7, 19.98, 0.0]\n  Target bbox: [630.47, 340.87, 649.49, 378.78]\n\nFrame 5 (current):\n  Drone pose: [50.33, -67.2, 24.4, -51.4, 22.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.09, \"ymin\": 338.43, \"xmax\": 651.83, \"ymax\": 381.23}, \"waypoint_deltas\": [{\"dx\": 0.79, \"dy\": 0.05, \"dz\": -0.26, \"dpitch\": 0.18, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": 1.41, \"dy\": -0.31, \"dz\": -0.53, \"dpitch\": 0.88, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": -0.47, \"dz\": -0.76, \"dpitch\": 1.31, \"dyaw\": 6.15, \"droll\": 0.0}, {\"dx\": 2.46, \"dy\": -0.76, \"dz\": -1.01, \"dpitch\": 2.39, \"dyaw\": 7.91, \"droll\": 0.0}, {\"dx\": 2.61, \"dy\": -0.78, \"dz\": -1.28, \"dpitch\": 3.53, \"dyaw\": 8.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.71, "window_alt_abs_m": 1.56, "target_px_mean_hist": 203.0, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [53.81, -67.74, 22.42, -45.33, 32.87, 0.0]\n  Target bbox: [631.59, 340.74, 648.34, 378.96]\n\nFrame 2:\n  Drone pose: [54.27, -67.44, 22.22, -44.87, 33.27, 0.0]\n  Target bbox: [629.26, 339.11, 650.63, 380.65]\n\nFrame 3:\n  Drone pose: [54.74, -67.17, 22.02, -44.4, 33.72, 0.0]\n  Target bbox: [630.37, 340.01, 649.55, 379.7]\n\nFrame 4:\n  Drone pose: [55.19, -66.94, 21.84, -43.9, 34.24, 0.0]\n  Target bbox: [627.98, 337.39, 652.1, 382.42]\n\nFrame 5 (current):\n  Drone pose: [55.63, -66.74, 21.66, -43.75, 33.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.63, \"ymin\": 338.31, \"xmax\": 651.44, \"ymax\": 381.5}, \"waypoint_deltas\": [{\"dx\": 0.44, \"dy\": 0.21, \"dz\": -0.16, \"dpitch\": 0.14, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.45, \"dz\": -0.31, \"dpitch\": 0.25, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": 0.73, \"dz\": -0.45, \"dpitch\": 0.29, \"dyaw\": -1.84, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": 1.07, \"dz\": -0.58, \"dpitch\": 0.62, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": 2.3, \"dy\": 1.44, \"dz\": -0.7, \"dpitch\": 0.89, \"dyaw\": -1.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.9, "window_alt_abs_m": 0.76, "target_px_mean_hist": 222.2, "cur_frame_id": 52, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [59.91, -63.83, 20.58, -41.93, 33.59, 0.0]\n  Target bbox: [629.33, 338.79, 650.71, 381.0]\n\nFrame 2:\n  Drone pose: [60.4, -63.53, 20.5, -42.02, 32.93, 0.0]\n  Target bbox: [629.77, 338.57, 650.13, 381.18]\n\nFrame 3:\n  Drone pose: [60.88, -63.27, 20.44, -41.73, 33.41, 0.0]\n  Target bbox: [628.06, 336.31, 652.03, 383.5]\n\nFrame 4:\n  Drone pose: [61.35, -63.03, 20.38, -41.79, 32.85, 0.0]\n  Target bbox: [629.38, 338.46, 650.66, 381.3]\n\nFrame 5 (current):\n  Drone pose: [61.83, -62.8, 20.32, -41.85, 32.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.71, \"ymin\": 338.99, \"xmax\": 649.2, \"ymax\": 380.76}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.23, \"dz\": -0.04, \"dpitch\": 0.28, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.46, \"dz\": -0.08, \"dpitch\": 0.21, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.69, \"dz\": -0.12, \"dpitch\": 0.46, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": 0.92, \"dz\": -0.15, \"dpitch\": 0.37, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": 2.42, \"dy\": 1.14, \"dz\": -0.18, \"dpitch\": 0.6, \"dyaw\": 0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.24, "window_alt_abs_m": 0.25, "target_px_mean_hist": 243.0, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [65.83, -61.05, 20.08, -41.32, 32.86, 0.0]\n  Target bbox: [629.22, 337.78, 650.67, 381.95]\n\nFrame 2:\n  Drone pose: [66.37, -60.86, 20.07, -41.13, 33.59, 0.0]\n  Target bbox: [627.68, 336.68, 652.42, 383.18]\n\nFrame 3:\n  Drone pose: [66.92, -60.67, 20.06, -41.3, 33.26, 0.0]\n  Target bbox: [627.97, 336.05, 652.13, 383.76]\n\nFrame 4:\n  Drone pose: [67.47, -60.47, 20.05, -41.48, 32.9, 0.0]\n  Target bbox: [629.02, 337.79, 650.86, 381.99]\n\nFrame 5 (current):\n  Drone pose: [68.02, -60.26, 20.04, -41.31, 33.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.19, \"ymin\": 337.57, \"xmax\": 651.88, \"ymax\": 382.27}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.22, \"dz\": -0.01, \"dpitch\": -0.19, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 0.44, \"dz\": -0.02, \"dpitch\": -0.03, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 0.67, \"dz\": -0.02, \"dpitch\": -0.22, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 0.92, \"dz\": -0.03, \"dpitch\": -0.04, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": 1.17, \"dz\": -0.03, \"dpitch\": -0.23, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.14, "window_alt_abs_m": 0.05, "target_px_mean_hist": 255.8, "cur_frame_id": 77, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [72.19, -58.19, 20.01, -41.5, 33.68, 0.0]\n  Target bbox: [629.66, 338.57, 650.23, 381.2]\n\nFrame 2:\n  Drone pose: [72.72, -57.83, 20.0, -41.43, 34.03, 0.0]\n  Target bbox: [630.56, 340.13, 649.37, 379.6]\n\nFrame 3:\n  Drone pose: [73.25, -57.45, 20.0, -41.37, 34.34, 0.0]\n  Target bbox: [630.36, 338.66, 649.53, 381.09]\n\nFrame 4:\n  Drone pose: [73.8, -57.05, 20.0, -41.34, 34.62, 0.0]\n  Target bbox: [629.2, 338.46, 650.68, 381.3]\n\nFrame 5 (current):\n  Drone pose: [74.34, -56.64, 20.0, -41.33, 34.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.33, \"ymin\": 337.68, \"xmax\": 650.54, \"ymax\": 382.09}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 1.35, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": 2.14, \"dy\": 1.83, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 2.67, \"dy\": 2.31, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.2, "window_alt_abs_m": 0.0, "target_px_mean_hist": 264.5, "cur_frame_id": 89, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00101/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [78.64, -52.88, 20.0, -41.46, 35.83, 0.0]\n  Target bbox: [631.15, 339.63, 648.77, 380.08]\n\nFrame 2:\n  Drone pose: [79.19, -52.42, 20.0, -41.48, 36.01, 0.0]\n  Target bbox: [629.36, 338.58, 650.53, 381.17]\n\nFrame 3:\n  Drone pose: [79.75, -51.99, 20.0, -41.49, 36.25, 0.0]\n  Target bbox: [629.5, 337.75, 650.36, 382.01]\n\nFrame 4:\n  Drone pose: [80.31, -51.59, 20.0, -41.48, 36.55, 0.0]\n  Target bbox: [629.38, 338.1, 650.5, 381.63]\n\nFrame 5 (current):\n  Drone pose: [80.87, -51.23, 20.0, -41.44, 36.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.81, \"ymin\": 339.27, \"xmax\": 650.08, \"ymax\": 380.48}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.29, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 0.87, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 0.76, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": 0.82, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.11, "window_alt_abs_m": 0.0, "target_px_mean_hist": 261.0, "cur_frame_id": 101, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/ORI/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.0, -50.35, 20.0, -41.46, 37.39, 0.0]\n  Target bbox: [627.23, 336.27, 652.85, 383.57]\n\nFrame 2:\n  Drone pose: [85.49, -50.35, 20.0, -41.46, 37.37, 0.0]\n  Target bbox: [629.92, 338.99, 650.08, 380.75]\n\nFrame 3:\n  Drone pose: [85.98, -50.35, 20.0, -41.45, 37.36, 0.0]\n  Target bbox: [628.86, 338.03, 651.17, 381.76]\n\nFrame 4:\n  Drone pose: [86.47, -50.34, 20.0, -41.45, 37.34, 0.0]\n  Target bbox: [627.05, 336.05, 653.04, 383.79]\n\nFrame 5 (current):\n  Drone pose: [86.97, -50.33, 20.0, -41.45, 37.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.52, \"ymin\": 338.51, \"xmax\": 650.48, \"ymax\": 381.24}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": 1.83, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 257.2, "cur_frame_id": 113, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [32.44, -67.02, 22.0, -54.1, 1.73, 0.0]\n  Target bbox: [628.13, 337.53, 651.87, 381.97]\n\nFrame 2:\n  Drone pose: [32.67, -67.06, 21.34, -50.48, 6.41, 0.0]\n  Target bbox: [601.57, 378.11, 622.37, 417.33]\n\nFrame 3:\n  Drone pose: [32.9, -67.1, 21.35, -53.79, 8.75, 0.0]\n  Target bbox: [573.26, 310.71, 603.5, 360.6]\n\nFrame 4:\n  Drone pose: [33.13, -67.14, 21.36, -51.73, 5.57, 0.0]\n  Target bbox: [627.44, 336.94, 652.56, 382.62]\n\nFrame 5 (current):\n  Drone pose: [33.36, -67.18, 21.38, -54.77, 12.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 575.6, \"ymin\": 279.45, \"xmax\": 598.87, \"ymax\": 323.93}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": -0.04, \"dz\": 0.02, \"dpitch\": 3.99, \"dyaw\": -4.98, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -0.08, \"dz\": 0.05, \"dpitch\": 4.4, \"dyaw\": -4.96, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -0.12, \"dz\": 0.09, \"dpitch\": 4.8, \"dyaw\": -4.95, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.16, \"dz\": 0.13, \"dpitch\": 5.18, \"dyaw\": -4.93, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -0.2, \"dz\": 0.15, \"dpitch\": 5.59, \"dyaw\": -4.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.95, "window_alt_abs_m": 0.7, "target_px_mean_hist": 282.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.2, -67.5, 21.79, -48.27, 7.46, 0.0]\n  Target bbox: [631.1, 337.83, 649.01, 381.79]\n\nFrame 2:\n  Drone pose: [35.43, -67.54, 21.9, -50.12, 2.47, 0.0]\n  Target bbox: [686.55, 308.45, 706.03, 344.28]\n\nFrame 3:\n  Drone pose: [35.66, -67.58, 22.03, -47.78, 7.49, 0.0]\n  Target bbox: [631.17, 337.08, 648.96, 382.56]\n\nFrame 4:\n  Drone pose: [35.89, -67.62, 22.17, -44.79, 7.97, 0.0]\n  Target bbox: [625.34, 385.75, 644.03, 427.18]\n\nFrame 5 (current):\n  Drone pose: [36.05, -67.71, 22.32, -46.5, 12.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 663.03, \"ymin\": 336.91, \"xmax\": 689.3, \"ymax\": 382.09}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.01, \"dz\": 0.17, \"dpitch\": -0.71, \"dyaw\": -5.4, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.03, \"dz\": 0.34, \"dpitch\": -0.55, \"dyaw\": -5.39, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": -0.07, \"dz\": 0.53, \"dpitch\": -0.42, \"dyaw\": -5.38, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": -0.11, \"dz\": 0.72, \"dpitch\": -0.29, \"dyaw\": -5.37, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": -0.15, \"dz\": 0.95, \"dpitch\": -0.54, \"dyaw\": -5.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.44, "window_alt_abs_m": 0.53, "target_px_mean_hist": 255.5, "cur_frame_id": 16, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [39.14, -67.61, 24.02, -44.19, 8.45, 0.0]\n  Target bbox: [615.6, 408.77, 632.06, 445.36]\n\nFrame 2:\n  Drone pose: [39.72, -67.23, 24.28, -48.67, 6.0, 0.0]\n  Target bbox: [631.76, 340.33, 648.34, 379.39]\n\nFrame 3:\n  Drone pose: [40.58, -66.97, 24.47, -52.89, 0.62, 0.0]\n  Target bbox: [684.09, 283.19, 700.28, 323.37]\n\nFrame 4:\n  Drone pose: [41.46, -67.04, 24.66, -48.75, 5.1, 0.0]\n  Target bbox: [638.21, 367.05, 654.6, 400.04]\n\nFrame 5 (current):\n  Drone pose: [42.34, -66.83, 24.85, -51.67, 10.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 577.57, \"ymin\": 328.56, \"xmax\": 596.67, \"ymax\": 370.62}, \"waypoint_deltas\": [{\"dx\": 1.1, \"dy\": -1.72, \"dz\": 0.7, \"dpitch\": -0.58, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": -3.16, \"dz\": 1.0, \"dpitch\": -0.92, \"dyaw\": 4.64, \"droll\": 0.0}, {\"dx\": 2.63, \"dy\": -4.05, \"dz\": 1.13, \"dpitch\": -0.78, \"dyaw\": 8.76, \"droll\": 0.0}, {\"dx\": 3.3, \"dy\": -4.21, \"dz\": 1.63, \"dpitch\": -1.21, \"dyaw\": 10.74, \"droll\": 0.0}, {\"dx\": 3.96, \"dy\": -4.3, \"dz\": 1.67, \"dpitch\": -1.15, \"dyaw\": 12.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.4, "window_alt_abs_m": 0.83, "target_px_mean_hist": 209.0, "cur_frame_id": 28, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [48.27, -70.84, 25.96, -52.11, 26.51, 0.0]\n  Target bbox: [630.76, 342.0, 649.2, 377.67]\n\nFrame 2:\n  Drone pose: [48.79, -69.15, 25.77, -52.6, 25.53, 0.0]\n  Target bbox: [606.5, 340.94, 628.27, 380.78]\n\nFrame 3:\n  Drone pose: [49.05, -67.02, 24.97, -50.45, 22.75, 0.0]\n  Target bbox: [586.53, 373.36, 605.36, 409.63]\n\nFrame 4:\n  Drone pose: [49.68, -67.12, 24.73, -48.97, 20.68, 0.0]\n  Target bbox: [631.53, 341.72, 648.55, 377.97]\n\nFrame 5 (current):\n  Drone pose: [50.33, -67.2, 24.4, -51.4, 22.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.81, \"ymin\": 340.21, \"xmax\": 649.12, \"ymax\": 379.44}, \"waypoint_deltas\": [{\"dx\": 0.79, \"dy\": 0.05, \"dz\": -0.26, \"dpitch\": 0.18, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": 1.41, \"dy\": -0.31, \"dz\": -0.53, \"dpitch\": 0.88, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": -0.47, \"dz\": -0.76, \"dpitch\": 1.31, \"dyaw\": 6.15, \"droll\": 0.0}, {\"dx\": 2.46, \"dy\": -0.76, \"dz\": -1.01, \"dpitch\": 2.39, \"dyaw\": 7.91, \"droll\": 0.0}, {\"dx\": 2.61, \"dy\": -0.78, \"dz\": -1.28, \"dpitch\": 3.53, \"dyaw\": 8.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.29, "window_alt_abs_m": 1.56, "target_px_mean_hist": 200.8, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [53.81, -67.74, 22.42, -43.46, 35.57, 0.0]\n  Target bbox: [598.57, 372.68, 617.32, 410.76]\n\nFrame 2:\n  Drone pose: [54.27, -67.44, 22.22, -42.7, 31.68, 0.0]\n  Target bbox: [649.55, 376.87, 668.27, 415.74]\n\nFrame 3:\n  Drone pose: [54.74, -67.17, 22.02, -44.4, 33.72, 0.0]\n  Target bbox: [630.11, 339.62, 649.8, 380.1]\n\nFrame 4:\n  Drone pose: [55.19, -66.94, 21.84, -45.11, 32.75, 0.0]\n  Target bbox: [650.33, 321.39, 665.84, 358.21]\n\nFrame 5 (current):\n  Drone pose: [55.63, -66.72, 21.5, -44.36, 29.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 686.54, \"ymin\": 350.04, \"xmax\": 710.39, \"ymax\": 396.43}, \"waypoint_deltas\": [{\"dx\": 0.44, \"dy\": 0.19, \"dz\": 0.0, \"dpitch\": 0.75, \"dyaw\": 3.67, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.43, \"dz\": -0.15, \"dpitch\": 0.86, \"dyaw\": 3.07, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": 0.71, \"dz\": -0.29, \"dpitch\": 0.9, \"dyaw\": 2.37, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": 1.05, \"dz\": -0.42, \"dpitch\": 1.23, \"dyaw\": 2.7, \"droll\": 0.0}, {\"dx\": 2.3, \"dy\": 1.42, \"dz\": -0.54, \"dpitch\": 1.5, \"dyaw\": 2.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.16, "window_alt_abs_m": 0.92, "target_px_mean_hist": 227.8, "cur_frame_id": 52, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [59.88, -63.82, 20.6, -41.69, 35.74, 0.0]\n  Target bbox: [629.95, 339.43, 649.95, 380.33]\n\nFrame 2:\n  Drone pose: [60.53, -63.54, 20.48, -36.18, 39.61, 0.0]\n  Target bbox: [596.98, 369.15, 618.27, 409.87]\n\nFrame 3:\n  Drone pose: [60.88, -63.27, 20.44, -41.73, 33.41, 0.0]\n  Target bbox: [627.49, 336.79, 652.61, 383.09]\n\nFrame 4:\n  Drone pose: [61.35, -63.03, 20.38, -42.72, 27.85, 0.0]\n  Target bbox: [691.41, 324.93, 714.0, 367.23]\n\nFrame 5 (current):\n  Drone pose: [61.83, -62.8, 20.32, -46.15, 29.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 663.74, \"ymin\": 267.26, \"xmax\": 684.19, \"ymax\": 308.86}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.23, \"dz\": -0.04, \"dpitch\": 4.58, \"dyaw\": 3.27, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.46, \"dz\": -0.08, \"dpitch\": 4.51, \"dyaw\": 2.74, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.69, \"dz\": -0.12, \"dpitch\": 4.76, \"dyaw\": 3.3, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": 0.92, \"dz\": -0.15, \"dpitch\": 4.67, \"dyaw\": 2.78, \"droll\": 0.0}, {\"dx\": 2.42, \"dy\": 1.14, \"dz\": -0.18, \"dpitch\": 4.9, \"dyaw\": 3.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.4, "window_alt_abs_m": 0.27, "target_px_mean_hist": 229.2, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [65.68, -60.93, 20.02, -39.86, 30.46, 0.0]\n  Target bbox: [630.02, 338.33, 650.09, 381.53]\n\nFrame 2:\n  Drone pose: [66.24, -60.81, 20.01, -45.11, 34.77, 0.0]\n  Target bbox: [630.22, 339.13, 649.84, 380.48]\n\nFrame 3:\n  Drone pose: [66.92, -60.67, 20.06, -41.3, 33.26, 0.0]\n  Target bbox: [629.57, 338.62, 650.45, 381.14]\n\nFrame 4:\n  Drone pose: [67.47, -60.47, 20.05, -41.48, 32.9, 0.0]\n  Target bbox: [629.06, 337.96, 650.82, 381.81]\n\nFrame 5 (current):\n  Drone pose: [68.02, -60.26, 20.04, -40.09, 35.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 606.27, \"ymin\": 359.49, \"xmax\": 627.55, \"ymax\": 401.68}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.22, \"dz\": -0.01, \"dpitch\": -1.41, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 0.44, \"dz\": -0.02, \"dpitch\": -1.25, \"dyaw\": -1.58, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 0.67, \"dz\": -0.02, \"dpitch\": -1.44, \"dyaw\": -2.05, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 0.92, \"dz\": -0.03, \"dpitch\": -1.26, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": 1.17, \"dz\": -0.03, \"dpitch\": -1.45, \"dyaw\": -2.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.71, "window_alt_abs_m": 0.07, "target_px_mean_hist": 258.5, "cur_frame_id": 77, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [72.19, -58.19, 20.01, -42.95, 37.11, 0.0]\n  Target bbox: [585.47, 314.42, 607.98, 358.51]\n\nFrame 2:\n  Drone pose: [72.72, -57.83, 20.0, -40.7, 38.3, 0.0]\n  Target bbox: [574.66, 352.28, 597.59, 394.37]\n\nFrame 3:\n  Drone pose: [73.25, -57.45, 20.0, -46.06, 35.4, 0.0]\n  Target bbox: [617.81, 260.71, 635.06, 301.78]\n\nFrame 4:\n  Drone pose: [73.8, -57.05, 20.0, -40.0, 30.66, 0.0]\n  Target bbox: [677.09, 360.02, 703.08, 407.05]\n\nFrame 5 (current):\n  Drone pose: [74.29, -56.75, 19.96, -44.43, 39.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.66, \"ymin\": 349.93, \"xmax\": 654.08, \"ymax\": 400.63}, \"waypoint_deltas\": [{\"dx\": 0.59, \"dy\": 0.55, \"dz\": 0.04, \"dpitch\": 3.1, \"dyaw\": -4.78, \"droll\": 0.0}, {\"dx\": 1.13, \"dy\": 1.0, \"dz\": 0.04, \"dpitch\": 3.1, \"dyaw\": -4.62, \"droll\": 0.0}, {\"dx\": 1.66, \"dy\": 1.46, \"dz\": 0.04, \"dpitch\": 3.09, \"dyaw\": -4.5, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": 1.94, \"dz\": 0.04, \"dpitch\": 3.08, \"dyaw\": -4.41, \"droll\": 0.0}, {\"dx\": 2.72, \"dy\": 2.42, \"dz\": 0.04, \"dpitch\": 3.07, \"dyaw\": -4.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.02, "window_alt_abs_m": 0.04, "target_px_mean_hist": 256.2, "cur_frame_id": 89, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00101/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [78.64, -52.88, 20.0, -37.94, 36.97, 0.0]\n  Target bbox: [614.91, 397.01, 636.03, 440.88]\n\nFrame 2:\n  Drone pose: [79.19, -52.42, 20.0, -42.67, 31.01, 0.0]\n  Target bbox: [693.85, 321.19, 712.19, 362.45]\n\nFrame 3:\n  Drone pose: [79.72, -52.09, 20.03, -37.64, 40.38, 0.0]\n  Target bbox: [563.89, 326.14, 581.47, 365.48]\n\nFrame 4:\n  Drone pose: [80.31, -51.59, 20.0, -41.37, 41.55, 0.0]\n  Target bbox: [566.67, 343.04, 587.3, 384.23]\n\nFrame 5 (current):\n  Drone pose: [80.87, -51.23, 20.0, -36.47, 39.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 598.43, \"ymin\": 423.64, \"xmax\": 616.15, \"ymax\": 463.8}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.29, \"dz\": 0.0, \"dpitch\": -4.86, \"dyaw\": -2.07, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -4.68, \"dyaw\": -1.44, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": -4.82, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 0.76, \"dz\": 0.0, \"dpitch\": -4.91, \"dyaw\": -1.89, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": 0.82, \"dz\": 0.0, \"dpitch\": -4.96, \"dyaw\": -2.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.52, "window_alt_abs_m": 0.06, "target_px_mean_hist": 248.5, "cur_frame_id": 101, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140/aug_001/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [85.19, -50.34, 20.04, -37.11, 39.04, 0.0]\n  Target bbox: [630.54, 340.39, 649.39, 379.44]\n\nFrame 2:\n  Drone pose: [85.49, -50.35, 20.0, -39.36, 42.37, 0.0]\n  Target bbox: [565.32, 374.91, 588.6, 418.79]\n\nFrame 3:\n  Drone pose: [85.87, -50.34, 20.13, -41.69, 36.98, 0.0]\n  Target bbox: [666.72, 330.32, 682.23, 369.9]\n\nFrame 4:\n  Drone pose: [86.47, -50.34, 20.0, -42.22, 33.7, 0.0]\n  Target bbox: [674.52, 326.22, 697.3, 369.83]\n\nFrame 5 (current):\n  Drone pose: [86.97, -50.33, 20.0, -41.43, 32.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 689.21, \"ymin\": 339.05, \"xmax\": 715.41, \"ymax\": 385.02}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 4.87, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 4.74, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 4.52, \"droll\": 0.0}, {\"dx\": 1.83, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 4.2, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 3.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.34, "window_alt_abs_m": 0.3, "target_px_mean_hist": 243.8, "cur_frame_id": 113, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776251140", "difficulty_score": 0.4324, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00004/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00005/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [12.25, 128.47, 21.2, -47.01, 177.05, 0.0]\n  Target bbox: [627.57, 330.18, 652.23, 388.97] (model-predicted box)\n\nFrame 2:\n  Drone pose: [11.21, 128.05, 20.67, -47.04, 175.64, 0.0]\n  Target bbox: [626.05, 324.12, 653.6, 395.14] (model-predicted box)\n\nFrame 3:\n  Drone pose: [10.47, 127.89, 20.64, -47.36, 175.07, 0.0]\n  Target bbox: [625.23, 322.55, 654.36, 396.79]\n\nFrame 4:\n  Drone pose: [9.84, 127.82, 20.62, -47.52, 174.84, 0.0]\n  Target bbox: [625.77, 323.44, 653.88, 395.76]\n\nFrame 5 (current):\n  Drone pose: [9.27, 127.81, 20.59, -47.59, 174.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.46, \"ymin\": 324.65, \"xmax\": 654.22, \"ymax\": 394.48}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.01, \"dz\": -0.02, \"dpitch\": 0.06, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.02, \"dz\": -0.04, \"dpitch\": 0.1, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.02, \"dz\": -0.06, \"dpitch\": 0.13, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -0.03, \"dz\": -0.17, \"dpitch\": 0.28, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -0.03, \"dz\": -0.2, \"dpitch\": 0.33, \"dyaw\": -0.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 2, "current_invisible": false, "window_yaw_abs_deg": 2.28, "window_alt_abs_m": 0.61, "target_px_mean_hist": 262.0, "cur_frame_id": 5, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, 
"seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [6.28, 127.78, 20.36, -47.22, 174.68, 0.0]\n  Target bbox: [627.07, 328.49, 652.72, 390.6]\n\nFrame 2:\n  Drone pose: [5.78, 127.78, 20.33, -47.18, 174.68, 0.0]\n  Target bbox: [625.62, 323.12, 654.02, 396.07]\n\nFrame 3:\n  Drone pose: [5.29, 127.78, 20.3, -47.13, 174.68, 0.0]\n  Target bbox: [625.58, 322.34, 654.05, 396.85]\n\nFrame 4:\n  Drone pose: [4.79, 127.78, 20.27, -47.09, 174.68, 0.0]\n  Target bbox: [626.63, 327.71, 653.15, 391.35]\n\nFrame 5 (current):\n  Drone pose: [4.29, 127.78, 20.24, -47.05, 174.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.47, \"ymin\": 322.26, \"xmax\": 654.14, \"ymax\": 396.97}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.08, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.11, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.14, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 0.16, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.11, "target_px_mean_hist": 567.2, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [1.29, 127.78, 20.12, -46.87, 174.68, 0.0]\n  Target bbox: [626.95, 328.33, 652.85, 390.74]\n\nFrame 2:\n  Drone pose: [0.79, 127.78, 20.1, -46.84, 174.68, 0.0]\n  Target bbox: [625.88, 323.84, 653.79, 395.35]\n\nFrame 3:\n  Drone pose: [0.29, 127.78, 20.09, -46.82, 174.68, 0.0]\n  Target bbox: [625.5, 322.24, 654.11, 397.0]\n\nFrame 4:\n  Drone pose: [-0.21, 127.78, 20.08, -46.8, 174.69, 0.0]\n  Target bbox: [625.69, 323.23, 653.96, 395.99]\n\nFrame 5 (current):\n  Drone pose: [-0.71, 127.78, 20.07, -46.79, 174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.53, \"ymin\": 325.63, \"xmax\": 653.19, \"ymax\": 393.52}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.06, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.05, "target_px_mean_hist": 576.5, "cur_frame_id": 25, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.71, 127.78, 20.03, -46.72, 174.69, 0.0]\n  Target bbox: [625.42, 323.25, 654.22, 396.0]\n\nFrame 2:\n  Drone pose: [-4.21, 127.78, 20.02, -46.72, 174.69, 0.0]\n  Target bbox: [624.76, 321.67, 654.83, 397.53]\n\nFrame 3:\n  Drone pose: [-4.71, 127.78, 20.02, -46.71, 174.69, 0.0]\n  Target bbox: [626.88, 328.07, 652.92, 390.95]\n\nFrame 4:\n  Drone pose: [-5.21, 127.78, 20.02, -46.71, 174.69, 0.0]\n  Target bbox: [626.85, 326.7, 652.9, 392.42]\n\nFrame 5 (current):\n  Drone pose: [-5.71, 127.78, 20.01, -46.7, 174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.83, \"ymin\": 323.68, \"xmax\": 653.83, \"ymax\": 395.51}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 594.2, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-8.71, 127.78, 20.0, -46.69, 174.69, 0.0]\n  Target bbox: [624.48, 320.35, 655.06, 398.97]\n\nFrame 2:\n  Drone pose: [-9.21, 127.78, 20.0, -46.69, 174.69, 0.0]\n  Target bbox: [624.39, 320.43, 655.15, 398.85]\n\nFrame 3:\n  Drone pose: [-9.71, 127.78, 20.0, -46.68, 174.69, 0.0]\n  Target bbox: [625.36, 321.86, 654.25, 397.36]\n\nFrame 4:\n  Drone pose: [-10.21, 127.78, 20.0, -46.68, 174.69, 0.0]\n  Target bbox: [625.04, 322.63, 654.56, 396.66]\n\nFrame 5 (current):\n  Drone pose: [-10.71, 127.78, 20.0, -46.68, 174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.3, \"ymin\": 321.97, \"xmax\": 654.31, \"ymax\": 397.26}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 581.0, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-13.71, 127.78, 20.0, -46.68, 174.69, 0.0]\n  Target bbox: [624.55, 320.72, 654.99, 398.63]\n\nFrame 2:\n  Drone pose: [-14.21, 127.78, 20.0, -46.68, 174.69, 0.0]\n  Target bbox: [625.71, 325.34, 654.0, 393.76]\n\nFrame 3:\n  Drone pose: [-14.71, 127.78, 20.0, -46.68, 174.68, 0.0]\n  Target bbox: [625.32, 322.25, 654.29, 396.97]\n\nFrame 4:\n  Drone pose: [-15.21, 127.78, 20.0, -46.68, 174.68, 0.0]\n  Target bbox: [626.85, 328.16, 652.95, 390.9]\n\nFrame 5 (current):\n  Drone pose: [-15.71, 127.78, 20.0, -46.68, 174.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.45, \"ymin\": 320.35, \"xmax\": 655.09, \"ymax\": 398.96}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 588.0, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-18.71, 127.7, 20.0, -46.67, 174.42, 0.0]\n  Target bbox: [626.71, 328.91, 653.11, 390.11]\n\nFrame 2:\n  Drone pose: [-19.21, 127.66, 20.0, -46.67, 174.31, 0.0]\n  Target bbox: [624.9, 321.95, 654.68, 397.38]\n\nFrame 3:\n  Drone pose: [-19.72, 127.61, 20.0, -46.66, 174.16, 0.0]\n  Target bbox: [624.21, 320.44, 655.34, 398.85]\n\nFrame 4:\n  Drone pose: [-20.22, 127.56, 20.0, -46.66, 173.98, 0.0]\n  Target bbox: [625.25, 323.98, 654.43, 395.16]\n\nFrame 5 (current):\n  Drone pose: [-20.72, 127.49, 20.0, -46.66, 173.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.07, \"ymin\": 320.4, \"xmax\": 655.48, \"ymax\": 398.88}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.3, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 597.8, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-23.8, 126.81, 20.0, -46.64, 171.59, 0.0]\n  Target bbox: [625.54, 326.75, 654.22, 392.37]\n\nFrame 2:\n  Drone pose: [-24.32, 126.67, 20.0, -46.63, 171.12, 0.0]\n  Target bbox: [623.22, 320.34, 656.33, 398.94]\n\nFrame 3:\n  Drone pose: [-24.84, 126.52, 20.0, -46.64, 170.66, 0.0]\n  Target bbox: [626.0, 327.55, 653.8, 391.49]\n\nFrame 4:\n  Drone pose: [-25.37, 126.39, 20.0, -46.65, 170.22, 0.0]\n  Target bbox: [623.8, 321.38, 655.79, 397.89]\n\nFrame 5 (current):\n  Drone pose: [-25.91, 126.27, 20.0, -46.67, 169.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.47, \"ymin\": 324.55, \"xmax\": 655.22, \"ymax\": 394.66}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.49, \"droll\": 0.0}, {\"dx\": -1.6, \"dy\": -0.16, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -2.15, \"droll\": 0.0}, {\"dx\": -2.15, \"dy\": -0.16, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -2.18, \"droll\": 0.0}, {\"dx\": -2.69, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -3.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.74, "window_alt_abs_m": 0.0, "target_px_mean_hist": 590.2, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-29.14, 126.16, 20.0, -46.67, 166.21, 0.0]\n  Target bbox: [618.54, 322.06, 661.62, 397.22]\n\nFrame 2:\n  Drone pose: [-29.69, 126.19, 20.0, -46.55, 164.75, 0.0]\n  Target bbox: [624.79, 327.02, 655.03, 392.1]\n\nFrame 3:\n  Drone pose: [-30.23, 126.24, 20.0, -46.64, 164.84, 0.0]\n  Target bbox: [617.62, 321.5, 662.57, 397.82]\n\nFrame 4:\n  Drone pose: [-30.78, 126.28, 20.0, -46.51, 163.39, 0.0]\n  Target bbox: [620.81, 319.38, 658.71, 400.04]\n\nFrame 5 (current):\n  Drone pose: [-31.32, 126.31, 20.0, -46.59, 163.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.22, \"ymin\": 322.74, \"xmax\": 659.0, \"ymax\": 396.48}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.38, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": -2.18, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -2.64, \"droll\": 0.0}, {\"dx\": -2.72, \"dy\": 0.25, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -2.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.06, "window_alt_abs_m": 0.0, "target_px_mean_hist": 562.5, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/ORI/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.58, 126.65, 20.0, -46.39, 159.72, 0.0]\n  Target bbox: [621.37, 320.93, 658.26, 398.37]\n\nFrame 2:\n  Drone pose: [-35.11, 126.75, 20.0, -46.5, 159.97, 0.0]\n  Target bbox: [619.98, 324.33, 660.14, 394.81]\n\nFrame 3:\n  Drone pose: [-35.65, 126.85, 20.0, -46.33, 158.76, 0.0]\n  Target bbox: [619.76, 319.55, 659.81, 399.82]\n\nFrame 4:\n  Drone pose: [-36.18, 126.96, 20.0, -46.44, 159.04, 0.0]\n  Target bbox: [622.54, 323.46, 657.71, 395.77]\n\nFrame 5 (current):\n  Drone pose: [-36.71, 127.07, 20.0, -46.26, 157.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.98, \"ymin\": 326.47, \"xmax\": 655.87, \"ymax\": 392.69}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.64, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -1.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.92, "window_alt_abs_m": 0.0, "target_px_mean_hist": 578.8, "cur_frame_id": 95, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00004/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00005/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [12.25, 128.47, 21.2, -47.01, 177.05, 0.0]\n  Target bbox: [627.17, 325.53, 652.52, 393.72] (model-predicted box)\n\nFrame 2:\n  Drone pose: [11.2, 127.96, 20.71, -47.12, 175.36, 0.0]\n  Target bbox: [626.21, 324.91, 653.47, 394.34] (model-predicted box)\n\nFrame 3:\n  Drone pose: [10.47, 127.89, 20.64, -43.47, 174.03, 0.0]\n  Target bbox: [637.73, 387.97, 666.12, 462.11]\n\nFrame 4:\n  Drone pose: [9.84, 127.82, 20.62, -49.93, 169.84, 0.0]\n  Target bbox: [681.27, 286.93, 712.54, 355.1]\n\nFrame 5 (current):\n  Drone pose: [9.27, 127.81, 20.59, -45.74, 177.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 591.0, \"ymin\": 357.09, \"xmax\": 619.39, \"ymax\": 425.48}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.01, \"dz\": -0.02, \"dpitch\": -1.79, \"dyaw\": -3.05, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.02, \"dz\": -0.04, \"dpitch\": -1.75, \"dyaw\": -3.07, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.02, \"dz\": -0.06, \"dpitch\": -1.72, \"dyaw\": -3.1, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -0.03, \"dz\": -0.17, \"dpitch\": -1.57, \"dyaw\": -3.11, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -0.03, \"dz\": -0.2, \"dpitch\": -1.52, \"dyaw\": -3.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 2, "current_invisible": false, "window_yaw_abs_deg": 15.18, "window_alt_abs_m": 0.61, "target_px_mean_hist": 271.8, "cur_frame_id": 5, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": 
false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [6.12, 127.83, 20.36, -50.76, 171.63, 0.0]\n  Target bbox: [660.77, 268.5, 691.51, 342.53]\n\nFrame 2:\n  Drone pose: [5.89, 127.65, 20.28, -43.1, 174.4, 0.0]\n  Target bbox: [625.5, 391.83, 652.09, 455.8]\n\nFrame 3:\n  Drone pose: [5.11, 127.75, 20.33, -44.16, 169.54, 0.0]\n  Target bbox: [680.92, 379.63, 712.97, 454.2]\n\nFrame 4:\n  Drone pose: [4.79, 127.78, 20.27, -48.41, 172.54, 0.0]\n  Target bbox: [651.21, 305.01, 678.1, 370.47]\n\nFrame 5 (current):\n  Drone pose: [4.29, 127.78, 20.24, -49.22, 178.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 579.7, \"ymin\": 293.79, \"xmax\": 606.8, \"ymax\": 354.96}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 2.21, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 2.25, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 2.28, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 2.31, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 2.33, \"dyaw\": -4.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.81, "window_alt_abs_m": 0.22, "target_px_mean_hist": 566.2, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [1.29, 127.78, 20.12, -48.73, 169.68, 0.0]\n  Target bbox: [683.13, 297.74, 712.05, 362.46]\n\nFrame 2:\n  Drone pose: [0.63, 127.73, 20.0, -48.36, 169.48, 0.0]\n  Target bbox: [682.61, 305.13, 712.42, 369.71]\n\nFrame 3:\n  Drone pose: [0.4, 127.77, 19.98, -51.47, 170.34, 0.0]\n  Target bbox: [673.37, 236.98, 707.75, 316.91]\n\nFrame 4:\n  Drone pose: [-0.05, 127.74, 20.15, -51.06, 172.12, 0.0]\n  Target bbox: [652.99, 246.55, 685.33, 325.45]\n\nFrame 5 (current):\n  Drone pose: [-0.71, 127.78, 20.07, -46.79, 174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.57, \"ymin\": 327.54, \"xmax\": 653.21, \"ymax\": 391.52}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.06, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.4, "window_alt_abs_m": 0.4, "target_px_mean_hist": 600.5, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.71, 127.78, 20.03, -44.7, 173.64, 0.0]\n  Target bbox: [638.5, 358.44, 665.91, 428.82]\n\nFrame 2:\n  Drone pose: [-4.22, 127.93, 20.08, -48.57, 171.87, 0.0]\n  Target bbox: [663.37, 298.17, 692.67, 364.49]\n\nFrame 3:\n  Drone pose: [-4.72, 127.78, 19.93, -46.6, 174.68, 0.0]\n  Target bbox: [624.76, 320.74, 654.79, 398.6]\n\nFrame 4:\n  Drone pose: [-5.18, 127.63, 20.06, -42.77, 169.94, 0.0]\n  Target bbox: [674.57, 393.93, 704.65, 460.15]\n\nFrame 5 (current):\n  Drone pose: [-5.71, 127.78, 20.01, -46.7, 174.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.19, \"ymin\": 327.19, \"xmax\": 653.56, \"ymax\": 391.91}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.06, "window_alt_abs_m": 0.39, "target_px_mean_hist": 592.5, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-8.85, 127.7, 19.95, -46.82, 174.4, 0.0]\n  Target bbox: [624.01, 320.32, 655.53, 398.94]\n\nFrame 2:\n  Drone pose: [-9.13, 127.7, 19.85, -43.99, 177.66, 0.0]\n  Target bbox: [588.36, 365.39, 616.65, 433.56]\n\nFrame 3:\n  Drone pose: [-9.71, 127.78, 20.0, -45.07, 173.75, 0.0]\n  Target bbox: [636.5, 351.06, 665.14, 422.55]\n\nFrame 4:\n  Drone pose: [-10.11, 127.81, 19.86, -46.31, 174.8, 0.0]\n  Target bbox: [626.5, 328.02, 653.29, 391.05]\n\nFrame 5 (current):\n  Drone pose: [-10.71, 127.78, 20.0, -48.72, 169.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 681.23, \"ymin\": 289.59, \"xmax\": 714.31, \"ymax\": 365.11}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": 5.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.34, "window_alt_abs_m": 0.54, "target_px_mean_hist": 605.8, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-13.71, 127.78, 20.0, -46.68, 174.69, 0.0]\n  Target bbox: [626.6, 328.13, 653.19, 390.94]\n\nFrame 2:\n  Drone pose: [-14.25, 127.78, 19.91, -45.37, 170.55, 0.0]\n  Target bbox: [671.32, 343.36, 703.9, 419.85]\n\nFrame 3:\n  Drone pose: [-14.71, 127.78, 20.0, -46.68, 174.68, 0.0]\n  Target bbox: [625.72, 323.26, 653.93, 395.93]\n\nFrame 4:\n  Drone pose: [-15.21, 127.78, 20.0, -45.89, 179.68, 0.0]\n  Target bbox: [566.42, 337.74, 597.35, 411.49]\n\nFrame 5 (current):\n  Drone pose: [-15.73, 127.79, 20.06, -46.8, 174.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.46, \"ymin\": 321.98, \"xmax\": 654.16, \"ymax\": 397.22}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.02, \"dz\": -0.06, \"dpitch\": 0.12, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.02, \"dz\": -0.06, \"dpitch\": 0.12, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.03, \"dz\": -0.06, \"dpitch\": 0.12, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -0.05, \"dz\": -0.06, \"dpitch\": 0.12, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 0.13, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.23, "window_alt_abs_m": 0.25, "target_px_mean_hist": 590.5, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-18.85, 127.73, 20.11, -42.07, 174.43, 0.0]\n  Target bbox: [625.25, 404.26, 655.88, 483.31]\n\nFrame 2:\n  Drone pose: [-19.35, 127.74, 20.01, -46.92, 174.52, 0.0]\n  Target bbox: [624.33, 320.7, 655.23, 398.53]\n\nFrame 3:\n  Drone pose: [-19.53, 127.69, 20.01, -46.4, 174.45, 0.0]\n  Target bbox: [624.39, 320.66, 655.16, 398.64]\n\nFrame 4:\n  Drone pose: [-20.22, 127.56, 20.0, -46.66, 173.98, 0.0]\n  Target bbox: [625.61, 325.4, 654.1, 393.71]\n\nFrame 5 (current):\n  Drone pose: [-20.72, 127.49, 20.0, -46.66, 173.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.06, \"ymin\": 325.93, \"xmax\": 653.68, \"ymax\": 393.15}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.3, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.86, "window_alt_abs_m": 0.11, "target_px_mean_hist": 602.5, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-23.8, 126.75, 20.16, -46.87, 171.4, 0.0]\n  Target bbox: [625.66, 325.98, 654.09, 393.11]\n\nFrame 2:\n  Drone pose: [-24.32, 126.67, 20.0, -43.96, 175.32, 0.0]\n  Target bbox: [574.81, 367.51, 607.19, 444.27]\n\nFrame 3:\n  Drone pose: [-24.84, 126.52, 20.0, -46.64, 170.66, 0.0]\n  Target bbox: [622.81, 319.9, 656.72, 399.43]\n\nFrame 4:\n  Drone pose: [-25.37, 126.39, 20.0, -48.17, 172.04, 0.0]\n  Target bbox: [604.47, 300.72, 632.72, 367.81]\n\nFrame 5 (current):\n  Drone pose: [-26.01, 126.13, 19.98, -44.19, 174.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 566.57, \"ymin\": 370.61, \"xmax\": 597.41, \"ymax\": 438.36}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.06, \"dz\": 0.02, \"dpitch\": -2.5, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -2.55, \"dyaw\": -4.97, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.02, \"dz\": 0.02, \"dpitch\": -2.44, \"dyaw\": -6.63, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": -0.02, \"dz\": 0.02, \"dpitch\": -2.5, \"dyaw\": -6.66, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -2.4, \"dyaw\": -8.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.26, "window_alt_abs_m": 0.18, "target_px_mean_hist": 593.8, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-29.11, 126.05, 19.86, -50.48, 164.0, 0.0]\n  Target bbox: [644.85, 256.15, 680.74, 325.08]\n\nFrame 2:\n  Drone pose: [-29.53, 126.11, 20.03, -46.89, 166.27, 0.0]\n  Target bbox: [603.19, 312.77, 637.12, 388.15]\n\nFrame 3:\n  Drone pose: [-30.23, 126.24, 20.0, -46.64, 164.84, 0.0]\n  Target bbox: [619.69, 321.98, 660.52, 397.25]\n\nFrame 4:\n  Drone pose: [-30.78, 126.28, 20.0, -51.51, 162.07, 0.0]\n  Target bbox: [637.5, 236.54, 673.67, 314.79]\n\nFrame 5 (current):\n  Drone pose: [-31.37, 126.4, 19.86, -46.54, 168.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 562.14, \"ymin\": 324.57, \"xmax\": 601.61, \"ymax\": 396.79}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.05, \"dz\": 0.14, \"dpitch\": 0.1, \"dyaw\": -6.69, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.02, \"dz\": 0.14, \"dpitch\": 0.01, \"dyaw\": -6.62, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.03, \"dz\": 0.14, \"dpitch\": -0.07, \"dyaw\": -6.52, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": 0.09, \"dz\": 0.14, \"dpitch\": 0.08, \"dyaw\": -7.88, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": 0.16, \"dz\": 0.14, \"dpitch\": -0.01, \"dyaw\": -7.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.11, "window_alt_abs_m": 0.35, "target_px_mean_hist": 574.8, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613/aug_001/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.61, 126.7, 20.07, -47.17, 160.3, 0.0]\n  Target bbox: [617.31, 314.24, 651.4, 385.1]\n\nFrame 2:\n  Drone pose: [-35.11, 126.75, 20.0, -45.21, 164.97, 0.0]\n  Target bbox: [560.8, 346.96, 602.82, 419.03]\n\nFrame 3:\n  Drone pose: [-35.65, 126.85, 20.0, -46.33, 158.76, 0.0]\n  Target bbox: [621.29, 321.57, 658.35, 397.78]\n\nFrame 4:\n  Drone pose: [-36.28, 126.92, 19.97, -41.52, 156.77, 0.0]\n  Target bbox: [644.27, 407.74, 683.78, 480.32]\n\nFrame 5 (current):\n  Drone pose: [-36.71, 127.07, 20.0, -46.26, 157.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.53, \"ymin\": 320.59, \"xmax\": 659.09, \"ymax\": 398.77}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.64, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -2.65, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -1.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.94, "window_alt_abs_m": 0.13, "target_px_mean_hist": 585.0, "cur_frame_id": 95, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_613", "difficulty_score": 0.4417, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [16.27, -82.16, 23.72, -48.45, 160.71, 0.0]\n  Target bbox: [621.86, 325.08, 657.82, 394.36]\n\nFrame 2:\n  Drone pose: [13.72, -79.7, 24.52, -53.44, 166.18, 0.0]\n  Target bbox: [621.34, 322.87, 658.26, 396.36]\n\nFrame 3:\n  Drone pose: [11.57, -76.38, 25.32, -57.84, 176.9, 0.0]\n  Target bbox: [625.64, 326.44, 654.07, 392.44]\n\nFrame 4:\n  Drone pose: [9.65, -72.64, 25.69, -60.19, -168.05, 0.0]\n  Target bbox: [623.1, 330.98, 657.06, 387.82]\n\nFrame 5 (current):\n  Drone pose: [8.07, -69.17, 25.36, -59.61, -153.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.58, \"ymin\": 324.98, \"xmax\": 660.64, \"ymax\": 393.91}, \"waypoint_deltas\": [{\"dx\": -1.37, \"dy\": 2.57, \"dz\": -0.8, \"dpitch\": 1.94, \"dyaw\": 10.45, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": 3.11, \"dz\": -1.6, \"dpitch\": 2.78, \"dyaw\": 13.24, \"droll\": 0.0}, {\"dx\": -3.04, \"dy\": 3.67, \"dz\": -2.14, \"dpitch\": 3.23, \"dyaw\": 13.87, \"droll\": 0.0}, {\"dx\": -3.6, \"dy\": 4.04, \"dz\": -2.23, \"dpitch\": 3.11, \"dyaw\": 13.64, \"droll\": 0.0}, {\"dx\": -4.12, \"dy\": 4.24, \"dz\": -2.26, \"dpitch\": 3.34, \"dyaw\": 14.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 46.03, "window_alt_abs_m": 2.31, "target_px_mean_hist": 338.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [1.94, -64.84, 22.22, -55.07, -138.65, 0.0]\n  Target bbox: [618.06, 322.68, 661.98, 396.15]\n\nFrame 2:\n  Drone pose: [1.54, -64.82, 22.08, -54.73, -138.87, 0.0]\n  Target bbox: [615.0, 318.93, 665.2, 399.89]\n\nFrame 3:\n  Drone pose: [1.15, -64.8, 21.95, -54.39, -139.09, 0.0]\n  Target bbox: [619.51, 328.08, 660.2, 390.63]\n\nFrame 4:\n  Drone pose: [0.76, -64.78, 21.82, -54.64, -140.78, 0.0]\n  Target bbox: [620.52, 329.2, 659.25, 389.52]\n\nFrame 5 (current):\n  Drone pose: [0.37, -64.76, 21.7, -54.87, -142.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.8, \"ymin\": 323.89, \"xmax\": 659.84, \"ymax\": 394.84}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.02, \"dz\": -0.13, \"dpitch\": -0.22, \"dyaw\": -1.8, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.04, \"dz\": -0.24, \"dpitch\": -0.41, \"dyaw\": -3.65, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.06, \"dz\": -0.35, \"dpitch\": -0.8, \"dyaw\": -5.54, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": -0.43, \"dpitch\": -1.0, \"dyaw\": -7.48, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 0.09, \"dz\": -0.5, \"dpitch\": -1.18, \"dyaw\": -9.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.88, "window_alt_abs_m": 0.52, "target_px_mean_hist": 542.0, "cur_frame_id": 17, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.56, -64.57, 21.02, -55.64, -156.4, 0.0]\n  Target bbox: [614.82, 315.03, 665.68, 403.76]\n\nFrame 2:\n  Drone pose: [-3.95, -64.56, 21.02, -55.43, -156.51, 0.0]\n  Target bbox: [616.02, 317.78, 664.31, 400.77]\n\nFrame 3:\n  Drone pose: [-4.35, -64.54, 21.03, -55.24, -156.61, 0.0]\n  Target bbox: [617.61, 319.84, 662.68, 398.82]\n\nFrame 4:\n  Drone pose: [-4.74, -64.52, 21.05, -55.06, -156.72, 0.0]\n  Target bbox: [617.01, 317.73, 663.37, 400.94]\n\nFrame 5 (current):\n  Drone pose: [-5.13, -64.5, 21.08, -54.9, -156.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.46, \"ymin\": 317.48, \"xmax\": 662.94, \"ymax\": 401.21}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.02, \"dz\": 0.04, \"dpitch\": 0.15, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.04, \"dz\": 0.09, \"dpitch\": 0.28, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.06, \"dz\": 0.15, \"dpitch\": 0.4, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": 0.22, \"dpitch\": 0.51, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.05, \"dz\": 0.29, \"dpitch\": 0.62, \"dyaw\": -0.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.42, "window_alt_abs_m": 0.06, "target_px_mean_hist": 622.8, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-8.53, -65.11, 21.87, -54.34, -166.7, 0.0]\n  Target bbox: [621.01, 320.11, 659.4, 398.7]\n\nFrame 2:\n  Drone pose: [-8.79, -65.29, 21.99, -54.14, -167.57, 0.0]\n  Target bbox: [619.83, 325.71, 660.17, 393.13]\n\nFrame 3:\n  Drone pose: [-9.05, -65.47, 22.12, -54.11, -170.25, 0.0]\n  Target bbox: [622.44, 321.12, 657.96, 397.73]\n\nFrame 4:\n  Drone pose: [-9.31, -65.65, 22.24, -53.91, -171.06, 0.0]\n  Target bbox: [622.48, 323.79, 657.86, 395.07]\n\nFrame 5 (current):\n  Drone pose: [-9.57, -65.83, 22.38, -53.71, -171.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.22, \"ymin\": 318.71, \"xmax\": 665.61, \"ymax\": 400.22}, \"waypoint_deltas\": [{\"dx\": -0.26, \"dy\": -0.18, \"dz\": 0.14, \"dpitch\": 0.09, \"dyaw\": -2.56, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": -0.36, \"dz\": 0.28, \"dpitch\": 0.29, \"dyaw\": -3.28, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": -0.54, \"dz\": 0.42, \"dpitch\": 0.44, \"dyaw\": -5.74, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.72, \"dz\": 0.57, \"dpitch\": 0.65, \"dyaw\": -6.39, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.9, \"dz\": 0.72, \"dpitch\": 0.85, \"dyaw\": -8.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.15, "window_alt_abs_m": 0.51, "target_px_mean_hist": 567.2, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-11.91, -67.45, 23.72, -51.93, 173.9, 0.0]\n  Target bbox: [624.86, 324.99, 654.8, 394.13]\n\nFrame 2:\n  Drone pose: [-12.17, -67.63, 23.89, -51.73, 173.42, 0.0]\n  Target bbox: [624.84, 329.25, 654.92, 389.88]\n\nFrame 3:\n  Drone pose: [-12.42, -67.81, 24.03, -51.5, 172.94, 0.0]\n  Target bbox: [618.85, 325.1, 661.23, 394.17]\n\nFrame 4:\n  Drone pose: [-12.68, -67.99, 24.16, -51.15, 170.96, 0.0]\n  Target bbox: [624.52, 329.5, 655.25, 389.68]\n\nFrame 5 (current):\n  Drone pose: [-13.45, -68.41, 24.41, -51.73, 169.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.57, \"ymin\": 326.42, \"xmax\": 659.55, \"ymax\": 392.85}, \"waypoint_deltas\": [{\"dx\": -0.83, \"dy\": -0.5, \"dz\": 0.1, \"dpitch\": -0.3, \"dyaw\": -3.25, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": -0.97, \"dz\": 0.18, \"dpitch\": -0.67, \"dyaw\": -4.91, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -1.39, \"dz\": 0.23, \"dpitch\": -0.82, \"dyaw\": -8.0, \"droll\": 0.0}, {\"dx\": -3.26, \"dy\": -1.68, \"dz\": 0.92, \"dpitch\": -1.87, \"dyaw\": -9.19, \"droll\": 0.0}, {\"dx\": -3.93, \"dy\": -1.83, \"dz\": 0.92, \"dpitch\": -1.76, \"dyaw\": -11.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.38, "window_alt_abs_m": 0.68, "target_px_mean_hist": 450.5, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-21.04, -70.49, 24.14, -52.96, 153.0, 0.0]\n  Target bbox: [621.71, 327.46, 658.2, 391.56]\n\nFrame 2:\n  Drone pose: [-22.36, -70.84, 23.92, -53.59, 150.68, 0.0]\n  Target bbox: [621.95, 325.25, 658.32, 393.75]\n\nFrame 3:\n  Drone pose: [-22.98, -70.89, 23.77, -53.11, 148.86, 0.0]\n  Target bbox: [618.67, 322.62, 661.08, 396.5]\n\nFrame 4:\n  Drone pose: [-23.53, -70.9, 23.56, -52.93, 148.74, 0.0]\n  Target bbox: [619.51, 324.99, 660.81, 394.09]\n\nFrame 5 (current):\n  Drone pose: [-24.01, -70.87, 23.35, -52.23, 147.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.21, \"ymin\": 320.11, \"xmax\": 663.49, \"ymax\": 399.05}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.13, \"dz\": -0.21, \"dpitch\": 0.39, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -0.7, \"dy\": 0.24, \"dz\": -0.42, \"dpitch\": 1.14, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": 0.34, \"dz\": -0.63, \"dpitch\": 1.38, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": 0.47, \"dz\": -0.86, \"dpitch\": 2.0, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": 0.63, \"dz\": -1.04, \"dpitch\": 2.08, \"dyaw\": -0.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.53, "window_alt_abs_m": 0.8, "target_px_mean_hist": 466.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.8, -68.87, 21.49, -48.18, 143.86, 0.0]\n  Target bbox: [620.1, 324.0, 660.3, 395.17]\n\nFrame 2:\n  Drone pose: [-29.35, -68.52, 21.35, -47.92, 143.4, 0.0]\n  Target bbox: [626.34, 324.54, 653.97, 394.66]\n\nFrame 3:\n  Drone pose: [-29.92, -68.14, 21.22, -47.7, 142.96, 0.0]\n  Target bbox: [625.85, 325.81, 654.43, 393.38]\n\nFrame 4:\n  Drone pose: [-30.49, -67.76, 21.1, -47.51, 142.53, 0.0]\n  Target bbox: [624.27, 323.0, 656.11, 396.18]\n\nFrame 5 (current):\n  Drone pose: [-31.07, -67.37, 20.99, -47.34, 142.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.25, \"ymin\": 322.9, \"xmax\": 656.12, \"ymax\": 396.27}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": 0.39, \"dz\": -0.11, \"dpitch\": 0.14, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.79, \"dz\": -0.2, \"dpitch\": 0.27, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": 1.18, \"dz\": -0.29, \"dpitch\": 0.4, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": -2.34, \"dy\": 1.58, \"dz\": -0.38, \"dpitch\": 0.54, \"dyaw\": -1.68, \"droll\": 0.0}, {\"dx\": -2.89, \"dy\": 1.98, \"dz\": -0.45, \"dpitch\": 0.68, \"dyaw\": -2.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.75, "window_alt_abs_m": 0.51, "target_px_mean_hist": 542.2, "cur_frame_id": 86, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-36.0, -63.65, 20.31, -46.11, 139.41, 0.0]\n  Target bbox: [626.66, 325.41, 653.64, 393.77]\n\nFrame 2:\n  Drone pose: [-36.46, -63.14, 20.27, -46.01, 139.5, 0.0]\n  Target bbox: [622.68, 324.6, 657.71, 394.59]\n\nFrame 3:\n  Drone pose: [-36.92, -62.59, 20.23, -45.95, 139.69, 0.0]\n  Target bbox: [627.4, 323.98, 652.91, 395.25]\n\nFrame 4:\n  Drone pose: [-37.36, -62.03, 20.2, -45.9, 139.96, 0.0]\n  Target bbox: [621.31, 323.72, 659.13, 395.5]\n\nFrame 5 (current):\n  Drone pose: [-37.79, -61.44, 20.17, -45.86, 140.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.09, \"ymin\": 322.94, \"xmax\": 654.26, \"ymax\": 396.28}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.59, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": 1.18, \"dz\": -0.05, \"dpitch\": 0.09, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 1.75, \"dz\": -0.07, \"dpitch\": 0.17, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": -1.65, \"dy\": 2.28, \"dz\": -0.09, \"dpitch\": 0.28, \"dyaw\": 1.38, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": 2.76, \"dz\": -0.1, \"dpitch\": 0.45, \"dyaw\": 1.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.87, "window_alt_abs_m": 0.14, "target_px_mean_hist": 589.2, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-41.76, -56.91, 20.02, -45.98, 147.29, 0.0]\n  Target bbox: [617.57, 319.94, 662.25, 399.28]\n\nFrame 2:\n  Drone pose: [-42.15, -56.62, 20.02, -46.08, 148.26, 0.0]\n  Target bbox: [618.52, 320.17, 661.29, 398.99]\n\nFrame 3:\n  Drone pose: [-42.54, -56.33, 20.02, -46.16, 149.21, 0.0]\n  Target bbox: [616.55, 318.89, 663.1, 400.54]\n\nFrame 4:\n  Drone pose: [-42.94, -56.04, 20.01, -46.24, 150.16, 0.0]\n  Target bbox: [618.61, 321.37, 661.18, 397.9]\n\nFrame 5 (current):\n  Drone pose: [-43.34, -55.76, 20.01, -46.32, 151.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.19, \"ymin\": 316.48, \"xmax\": 664.42, \"ymax\": 402.84}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 0.15, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 0.17, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.12, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.08, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.8, "window_alt_abs_m": 0.01, "target_px_mean_hist": 609.0, "cur_frame_id": 113, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.36, -55.93, 20.0, -46.21, 150.6, 0.0]\n  Target bbox: [620.14, 323.23, 659.73, 395.96]\n\nFrame 2:\n  Drone pose: [-48.87, -55.98, 20.0, -46.19, 150.44, 0.0]\n  Target bbox: [616.43, 317.96, 663.26, 401.3]\n\nFrame 3:\n  Drone pose: [-49.39, -56.03, 20.0, -46.17, 150.29, 0.0]\n  Target bbox: [617.32, 318.68, 662.34, 400.64]\n\nFrame 4:\n  Drone pose: [-49.9, -56.08, 20.0, -46.15, 150.13, 0.0]\n  Target bbox: [618.73, 321.47, 661.05, 397.82]\n\nFrame 5 (current):\n  Drone pose: [-50.41, -56.12, 20.0, -46.13, 149.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.87, \"ymin\": 318.45, \"xmax\": 662.79, \"ymax\": 400.89}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.15, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.62, "window_alt_abs_m": 0.0, "target_px_mean_hist": 589.8, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [16.42, -82.13, 23.72, -50.04, 156.81, 0.0]\n  Target bbox: [618.06, 323.69, 661.77, 395.58]\n\nFrame 2:\n  Drone pose: [13.72, -79.7, 24.52, -53.69, 171.18, 0.0]\n  Target bbox: [567.65, 320.65, 611.71, 393.46]\n\nFrame 3:\n  Drone pose: [11.54, -76.47, 25.47, -57.89, 176.51, 0.0]\n  Target bbox: [580.44, 353.52, 626.7, 423.1]\n\nFrame 4:\n  Drone pose: [9.65, -72.64, 25.69, -60.19, -168.05, 0.0]\n  Target bbox: [622.03, 325.68, 658.25, 393.06] (model-predicted box)\n\nFrame 5 (current):\n  Drone pose: [8.07, -69.17, 25.36, -59.61, -153.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.81, \"ymin\": 329.39, \"xmax\": 658.28, \"ymax\": 389.5}, \"waypoint_deltas\": [{\"dx\": -1.37, \"dy\": 2.57, \"dz\": -0.8, \"dpitch\": 1.94, \"dyaw\": 10.45, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": 3.11, \"dz\": -1.6, \"dpitch\": 2.78, \"dyaw\": 13.24, \"droll\": 0.0}, {\"dx\": -3.04, \"dy\": 3.67, \"dz\": -2.14, \"dpitch\": 3.23, \"dyaw\": 13.87, \"droll\": 0.0}, {\"dx\": -3.6, \"dy\": 4.04, \"dz\": -2.23, \"dpitch\": 3.11, \"dyaw\": 13.64, \"droll\": 0.0}, {\"dx\": -4.12, \"dy\": 4.24, \"dz\": -2.26, \"dpitch\": 3.34, \"dyaw\": 14.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 1, "current_invisible": false, "window_yaw_abs_deg": 49.93, "window_alt_abs_m": 2.31, "target_px_mean_hist": 353.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": 
"seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [1.94, -64.84, 22.22, -51.26, -143.32, 0.0]\n  Target bbox: [657.94, 381.83, 713.82, 468.21]\n\nFrame 2:\n  Drone pose: [1.54, -64.82, 22.08, -54.73, -138.87, 0.0]\n  Target bbox: [615.05, 318.85, 665.14, 399.95]\n\nFrame 3:\n  Drone pose: [1.15, -64.8, 21.95, -54.39, -139.09, 0.0]\n  Target bbox: [623.3, 326.15, 656.42, 392.49]\n\nFrame 4:\n  Drone pose: [0.76, -64.78, 21.82, -52.45, -145.78, 0.0]\n  Target bbox: [667.48, 358.24, 710.4, 437.3]\n\nFrame 5 (current):\n  Drone pose: [0.37, -64.76, 21.7, -49.87, -144.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 641.63, \"ymin\": 408.48, \"xmax\": 686.61, \"ymax\": 479.17}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.02, \"dz\": -0.13, \"dpitch\": -5.22, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.04, \"dz\": -0.24, \"dpitch\": -5.41, \"dyaw\": -1.21, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.06, \"dz\": -0.35, \"dpitch\": -5.8, \"dyaw\": -3.1, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": -0.43, \"dpitch\": -6.0, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 0.09, \"dz\": -0.5, \"dpitch\": -6.18, \"dyaw\": -7.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.17, "window_alt_abs_m": 0.52, "target_px_mean_hist": 563.8, "cur_frame_id": 17, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.56, -64.57, 21.02, -55.64, -156.4, 0.0]\n  Target bbox: [614.83, 315.2, 665.6, 403.35]\n\nFrame 2:\n  Drone pose: [-3.95, -64.56, 21.02, -58.35, -152.76, 0.0]\n  Target bbox: [579.66, 270.74, 627.6, 351.79]\n\nFrame 3:\n  Drone pose: [-4.34, -64.68, 21.09, -58.06, -165.59, 0.0]\n  Target bbox: [617.7, 320.69, 662.34, 397.64]\n\nFrame 4:\n  Drone pose: [-4.74, -64.52, 21.05, -53.68, -159.18, 0.0]\n  Target bbox: [641.9, 342.65, 687.03, 423.21]\n\nFrame 5 (current):\n  Drone pose: [-5.13, -64.5, 21.08, -54.8, -155.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 600.27, \"ymin\": 316.41, \"xmax\": 651.23, \"ymax\": 405.92}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.02, \"dz\": 0.04, \"dpitch\": 0.05, \"dyaw\": -1.56, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.04, \"dz\": 0.09, \"dpitch\": 0.18, \"dyaw\": -1.66, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.06, \"dz\": 0.15, \"dpitch\": 0.3, \"dyaw\": -1.76, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": 0.22, \"dpitch\": 0.41, \"dyaw\": -1.86, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.05, \"dz\": 0.29, \"dpitch\": 0.52, \"dyaw\": -2.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 26.7, "window_alt_abs_m": 0.15, "target_px_mean_hist": 623.2, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-8.53, -65.11, 21.87, -54.53, -161.7, 0.0]\n  Target bbox: [570.16, 319.69, 611.23, 395.87]\n\nFrame 2:\n  Drone pose: [-8.79, -65.29, 21.99, -51.69, -162.57, 0.0]\n  Target bbox: [563.76, 361.21, 616.16, 443.56]\n\nFrame 3:\n  Drone pose: [-9.05, -65.47, 22.12, -54.46, -175.25, 0.0]\n  Target bbox: [670.73, 315.59, 709.12, 395.25]\n\nFrame 4:\n  Drone pose: [-9.31, -65.65, 22.24, -57.87, -166.06, 0.0]\n  Target bbox: [566.4, 257.35, 613.11, 332.09]\n\nFrame 5 (current):\n  Drone pose: [-9.73, -65.88, 22.48, -50.59, -166.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.94, \"ymin\": 323.44, \"xmax\": 661.04, \"ymax\": 395.61}, \"waypoint_deltas\": [{\"dx\": -0.1, \"dy\": -0.13, \"dz\": 0.04, \"dpitch\": -3.03, \"dyaw\": -7.91, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": -0.31, \"dz\": 0.18, \"dpitch\": -2.83, \"dyaw\": -8.63, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -0.49, \"dz\": 0.32, \"dpitch\": -2.68, \"dyaw\": -11.09, \"droll\": 0.0}, {\"dx\": -0.88, \"dy\": -0.67, \"dz\": 0.47, \"dpitch\": -2.47, \"dyaw\": -11.74, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": -0.85, \"dz\": 0.62, \"dpitch\": -2.27, \"dyaw\": -14.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.18, "window_alt_abs_m": 0.61, "target_px_mean_hist": 596.5, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-11.91, -67.45, 23.72, -49.8, 168.9, 0.0]\n  Target bbox: [675.07, 366.95, 709.0, 426.96]\n\nFrame 2:\n  Drone pose: [-12.17, -67.63, 23.89, -50.24, 172.95, 0.0]\n  Target bbox: [629.75, 352.64, 659.93, 416.61]\n\nFrame 3:\n  Drone pose: [-12.42, -67.81, 24.03, -51.5, 172.94, 0.0]\n  Target bbox: [621.43, 328.06, 658.48, 391.12]\n\nFrame 4:\n  Drone pose: [-12.7, -68.03, 24.03, -49.95, 165.81, 0.0]\n  Target bbox: [625.0, 326.94, 654.72, 392.32]\n\nFrame 5 (current):\n  Drone pose: [-13.45, -68.41, 24.41, -51.73, 169.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.79, \"ymin\": 326.27, \"xmax\": 658.33, \"ymax\": 392.87}, \"waypoint_deltas\": [{\"dx\": -0.83, \"dy\": -0.5, \"dz\": 0.1, \"dpitch\": -0.3, \"dyaw\": -3.25, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": -0.97, \"dz\": 0.18, \"dpitch\": -0.67, \"dyaw\": -4.91, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -1.39, \"dz\": 0.23, \"dpitch\": -0.82, \"dyaw\": -8.0, \"droll\": 0.0}, {\"dx\": -3.26, \"dy\": -1.68, \"dz\": 0.92, \"dpitch\": -1.87, \"dyaw\": -9.19, \"droll\": 0.0}, {\"dx\": -3.93, \"dy\": -1.83, \"dz\": 0.92, \"dpitch\": -1.76, \"dyaw\": -11.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.91, "window_alt_abs_m": 0.69, "target_px_mean_hist": 487.2, "cur_frame_id": 58, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-21.04, -70.49, 24.14, -54.11, 150.39, 0.0]\n  Target bbox: [647.9, 308.32, 685.78, 373.27]\n\nFrame 2:\n  Drone pose: [-22.45, -70.8, 23.85, -58.4, 148.75, 0.0]\n  Target bbox: [570.67, 306.14, 617.35, 380.64]\n\nFrame 3:\n  Drone pose: [-22.98, -70.89, 23.77, -53.11, 148.86, 0.0]\n  Target bbox: [621.66, 327.57, 658.26, 391.52]\n\nFrame 4:\n  Drone pose: [-23.53, -70.9, 23.56, -52.93, 148.74, 0.0]\n  Target bbox: [618.47, 324.48, 661.84, 394.51]\n\nFrame 5 (current):\n  Drone pose: [-24.01, -70.87, 23.35, -50.7, 152.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 568.11, \"ymin\": 351.38, \"xmax\": 607.71, \"ymax\": 422.77}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.13, \"dz\": -0.21, \"dpitch\": -1.14, \"dyaw\": -4.3, \"droll\": 0.0}, {\"dx\": -0.7, \"dy\": 0.24, \"dz\": -0.42, \"dpitch\": -0.39, \"dyaw\": -5.21, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": 0.34, \"dz\": -0.63, \"dpitch\": -0.15, \"dyaw\": -4.85, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": 0.47, \"dz\": -0.86, \"dpitch\": 0.47, \"dyaw\": -5.85, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": 0.63, \"dz\": -1.04, \"dpitch\": 0.55, \"dyaw\": -5.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.59, "window_alt_abs_m": 0.8, "target_px_mean_hist": 478.8, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.77, -69.07, 21.49, -46.61, 135.73, 0.0]\n  Target bbox: [624.66, 327.2, 655.1, 391.91]\n\nFrame 2:\n  Drone pose: [-29.35, -68.52, 21.35, -47.92, 143.4, 0.0]\n  Target bbox: [623.62, 323.9, 656.72, 395.2]\n\nFrame 3:\n  Drone pose: [-29.92, -68.14, 21.22, -47.7, 142.96, 0.0]\n  Target bbox: [621.18, 324.6, 659.26, 394.4]\n\nFrame 4:\n  Drone pose: [-30.47, -67.75, 20.91, -43.41, 137.76, 0.0]\n  Target bbox: [684.77, 319.79, 718.42, 386.7]\n\nFrame 5 (current):\n  Drone pose: [-31.07, -67.37, 20.99, -42.89, 139.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 653.24, \"ymin\": 398.11, \"xmax\": 686.84, \"ymax\": 471.7}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": 0.39, \"dz\": -0.11, \"dpitch\": -4.31, \"dyaw\": 2.16, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.79, \"dz\": -0.2, \"dpitch\": -4.18, \"dyaw\": 1.73, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": 1.18, \"dz\": -0.29, \"dpitch\": -4.05, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -2.34, \"dy\": 1.58, \"dz\": -0.38, \"dpitch\": -3.91, \"dyaw\": 0.91, \"droll\": 0.0}, {\"dx\": -2.89, \"dy\": 1.98, \"dz\": -0.45, \"dpitch\": -3.77, \"dyaw\": 0.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.06, "window_alt_abs_m": 0.66, "target_px_mean_hist": 543.0, "cur_frame_id": 86, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-36.0, -63.65, 20.31, -46.11, 139.41, 0.0]\n  Target bbox: [621.9, 322.68, 658.37, 396.44]\n\nFrame 2:\n  Drone pose: [-36.44, -62.99, 20.17, -39.27, 131.45, 0.0]\n  Target bbox: [682.46, 358.38, 725.06, 426.67]\n\nFrame 3:\n  Drone pose: [-36.92, -62.59, 20.23, -45.99, 143.85, 0.0]\n  Target bbox: [572.25, 322.2, 610.27, 398.15]\n\nFrame 4:\n  Drone pose: [-37.36, -62.03, 20.2, -40.9, 139.99, 0.0]\n  Target bbox: [627.25, 409.41, 652.31, 477.96]\n\nFrame 5 (current):\n  Drone pose: [-37.79, -61.44, 20.17, -45.67, 135.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 685.67, \"ymin\": 331.01, \"xmax\": 712.02, \"ymax\": 398.06}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.59, \"dz\": -0.03, \"dpitch\": -0.15, \"dyaw\": 5.37, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": 1.18, \"dz\": -0.05, \"dpitch\": -0.1, \"dyaw\": 5.75, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 1.75, \"dz\": -0.07, \"dpitch\": -0.02, \"dyaw\": 6.1, \"droll\": 0.0}, {\"dx\": -1.65, \"dy\": 2.28, \"dz\": -0.09, \"dpitch\": 0.09, \"dyaw\": 6.38, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": 2.76, \"dz\": -0.1, \"dpitch\": 0.26, \"dyaw\": 6.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 28.93, "window_alt_abs_m": 0.26, "target_px_mean_hist": 573.0, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-41.76, -56.91, 20.02, -44.77, 152.14, 0.0]\n  Target bbox: [561.75, 344.98, 604.04, 418.22]\n\nFrame 2:\n  Drone pose: [-42.15, -56.62, 20.02, -47.4, 150.28, 0.0]\n  Target bbox: [590.7, 294.62, 640.53, 380.92]\n\nFrame 3:\n  Drone pose: [-42.54, -56.33, 20.02, -46.16, 149.21, 0.0]\n  Target bbox: [616.66, 319.0, 662.99, 400.42]\n\nFrame 4:\n  Drone pose: [-42.94, -56.04, 20.01, -47.69, 145.16, 0.0]\n  Target bbox: [677.53, 300.66, 719.44, 373.7]\n\nFrame 5 (current):\n  Drone pose: [-43.34, -55.76, 20.01, -44.69, 155.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 570.76, \"ymin\": 351.72, \"xmax\": 611.78, \"ymax\": 424.8}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -1.66, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 0.15, \"dz\": 0.0, \"dpitch\": -1.66, \"dyaw\": -3.64, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 0.17, \"dz\": -0.01, \"dpitch\": -1.66, \"dyaw\": -3.57, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": 0.12, \"dz\": -0.01, \"dpitch\": -1.64, \"dyaw\": -3.72, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.08, \"dz\": -0.01, \"dpitch\": -1.62, \"dyaw\": -3.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.08, "window_alt_abs_m": 0.01, "target_px_mean_hist": 621.2, "cur_frame_id": 113, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.36, -55.93, 20.0, -46.21, 150.6, 0.0]\n  Target bbox: [617.83, 320.15, 661.91, 399.15]\n\nFrame 2:\n  Drone pose: [-48.87, -55.98, 20.0, -46.91, 152.55, 0.0]\n  Target bbox: [589.64, 306.27, 639.89, 389.61]\n\nFrame 3:\n  Drone pose: [-49.32, -55.87, 20.1, -40.74, 161.7, 0.0]\n  Target bbox: [555.62, 352.54, 599.33, 427.23]\n\nFrame 4:\n  Drone pose: [-49.9, -56.08, 20.0, -45.87, 155.13, 0.0]\n  Target bbox: [560.71, 330.18, 601.95, 401.98]\n\nFrame 5 (current):\n  Drone pose: [-50.41, -56.12, 20.0, -46.13, 149.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.99, \"ymin\": 321.55, \"xmax\": 660.79, \"ymax\": 397.72}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.15, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.82, "window_alt_abs_m": 0.2, "target_px_mean_hist": 595.0, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776097378", "difficulty_score": 0.4265, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.39, -43.56, 22.0, -46.48, 90.0, 0.0]\n  Target bbox: [621.99, 331.17, 658.01, 388.0]\n\nFrame 2:\n  Drone pose: [108.36, -43.96, 21.2, -44.07, 87.19, 0.0]\n  Target bbox: [622.08, 329.38, 657.63, 389.88]\n\nFrame 3:\n  Drone pose: [107.93, -43.83, 20.67, -42.78, 86.08, 0.0]\n  Target bbox: [620.91, 328.59, 658.78, 390.7]\n\nFrame 4:\n  Drone pose: [107.8, -43.43, 20.64, -42.59, 85.75, 0.0]\n  Target bbox: [619.05, 328.3, 660.61, 391.0]\n\nFrame 5 (current):\n  Drone pose: [107.79, -42.93, 20.62, -42.55, 85.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.04, \"ymin\": 330.21, \"xmax\": 655.73, \"ymax\": 389.1}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.03, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.54, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.05, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.56, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 507.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.79, -39.34, 20.36, -42.29, 85.69, 0.0]\n  Target bbox: [620.51, 329.23, 659.2, 390.08]\n\nFrame 2:\n  Drone pose: [107.79, -38.83, 20.33, -42.26, 85.69, 0.0]\n  Target bbox: [625.31, 330.25, 654.46, 389.08]\n\nFrame 3:\n  Drone pose: [107.79, -38.32, 20.3, -42.23, 85.69, 0.0]\n  Target bbox: [620.01, 329.33, 659.68, 390.03]\n\nFrame 4:\n  Drone pose: [107.79, -37.81, 20.27, -42.2, 85.68, 0.0]\n  Target bbox: [627.52, 329.73, 652.27, 389.59]\n\nFrame 5 (current):\n  Drone pose: [107.79, -37.3, 20.24, -42.18, 85.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.78, \"ymin\": 329.57, \"xmax\": 655.97, \"ymax\": 389.74}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.52, \"dz\": -0.07, \"dpitch\": 0.08, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.03, \"dz\": -0.09, \"dpitch\": 0.09, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.54, \"dz\": -0.11, \"dpitch\": 0.11, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.11, "target_px_mean_hist": 527.5, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.79, -33.24, 20.09, -42.04, 85.66, 0.0]\n  Target bbox: [618.64, 328.76, 661.02, 390.59]\n\nFrame 2:\n  Drone pose: [107.79, -32.73, 20.08, -42.03, 85.66, 0.0]\n  Target bbox: [620.59, 328.76, 659.11, 390.52]\n\nFrame 3:\n  Drone pose: [107.79, -32.22, 20.07, -42.02, 85.66, 0.0]\n  Target bbox: [622.57, 329.4, 657.16, 389.94]\n\nFrame 4:\n  Drone pose: [107.79, -31.72, 20.06, -42.01, 85.66, 0.0]\n  Target bbox: [618.34, 327.78, 661.3, 391.52]\n\nFrame 5 (current):\n  Drone pose: [107.79, -31.21, 20.05, -42.01, 85.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.65, \"ymin\": 327.65, \"xmax\": 661.0, \"ymax\": 391.64}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.02, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.04, "target_px_mean_hist": 536.2, "cur_frame_id": 27, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.78, -27.68, 20.02, -42.0, 85.65, 0.0]\n  Target bbox: [619.76, 327.53, 659.9, 391.75]\n\nFrame 2:\n  Drone pose: [107.78, -27.17, 20.01, -42.01, 85.65, 0.0]\n  Target bbox: [624.55, 329.58, 655.21, 389.76]\n\nFrame 3:\n  Drone pose: [107.78, -26.67, 20.01, -42.01, 85.65, 0.0]\n  Target bbox: [625.28, 329.88, 654.49, 389.45]\n\nFrame 4:\n  Drone pose: [107.78, -26.17, 20.01, -42.01, 85.65, 0.0]\n  Target bbox: [621.57, 329.02, 658.15, 390.28]\n\nFrame 5 (current):\n  Drone pose: [107.78, -25.66, 20.01, -42.01, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.25, \"ymin\": 328.02, \"xmax\": 661.39, \"ymax\": 391.29}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 1.51, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": 2.51, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 550.2, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.81, -21.64, 20.0, -42.04, 85.7, 0.0]\n  Target bbox: [627.18, 330.14, 652.62, 389.19]\n\nFrame 2:\n  Drone pose: [107.82, -21.13, 20.0, -42.04, 85.73, 0.0]\n  Target bbox: [621.61, 329.05, 658.11, 390.23]\n\nFrame 3:\n  Drone pose: [107.84, -20.63, 20.0, -42.05, 85.78, 0.0]\n  Target bbox: [619.89, 327.6, 659.77, 391.68]\n\nFrame 4:\n  Drone pose: [107.86, -20.13, 20.0, -42.05, 85.84, 0.0]\n  Target bbox: [627.44, 330.0, 652.36, 389.33]\n\nFrame 5 (current):\n  Drone pose: [107.89, -19.63, 20.0, -42.06, 85.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.83, \"ymin\": 329.68, \"xmax\": 654.94, \"ymax\": 389.64}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.23, "window_alt_abs_m": 0.0, "target_px_mean_hist": 551.2, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [108.53, -15.56, 20.0, -42.2, 87.66, 0.0]\n  Target bbox: [622.98, 328.72, 656.72, 390.55]\n\nFrame 2:\n  Drone pose: [108.66, -15.05, 20.0, -42.22, 87.99, 0.0]\n  Target bbox: [624.48, 329.81, 655.26, 389.49]\n\nFrame 3:\n  Drone pose: [108.78, -14.53, 20.0, -42.26, 88.33, 0.0]\n  Target bbox: [620.69, 329.11, 658.99, 390.14]\n\nFrame 4:\n  Drone pose: [108.9, -14.0, 20.0, -42.29, 88.66, 0.0]\n  Target bbox: [622.11, 329.5, 657.59, 389.76]\n\nFrame 5 (current):\n  Drone pose: [109.01, -13.48, 20.0, -42.33, 88.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.85, \"ymin\": 328.19, \"xmax\": 660.76, \"ymax\": 391.05}, \"waypoint_deltas\": [{\"dx\": 0.1, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 1.07, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 1.6, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 2.13, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 2.13, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 2.67, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 2.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.31, "window_alt_abs_m": 0.0, "target_px_mean_hist": 546.8, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00073/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.34, -9.73, 20.0, -42.65, 92.62, 0.0]\n  Target bbox: [621.16, 329.13, 659.16, 390.15]\n\nFrame 2:\n  Drone pose: [109.33, -9.18, 20.0, -42.71, 92.6, 0.0]\n  Target bbox: [623.14, 326.42, 656.9, 392.99]\n\nFrame 3:\n  Drone pose: [109.31, -8.64, 20.0, -42.73, 93.93, 0.0]\n  Target bbox: [627.5, 329.71, 652.7, 389.58]\n\nFrame 4:\n  Drone pose: [109.27, -8.1, 20.0, -42.79, 93.84, 0.0]\n  Target bbox: [620.09, 324.29, 659.88, 395.24]\n\nFrame 5 (current):\n  Drone pose: [109.23, -7.55, 20.0, -42.8, 95.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.31, \"ymin\": 327.52, \"xmax\": 659.0, \"ymax\": 391.74}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": -0.12, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.93, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": 2.16, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 2.16, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 2.7, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": 2.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 547.0, "cur_frame_id": 73, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [108.73, -3.24, 20.0, -43.08, 98.0, 0.0]\n  Target bbox: [620.78, 324.61, 659.13, 394.87]\n\nFrame 2:\n  Drone pose: [108.65, -2.7, 20.0, -43.04, 99.17, 0.0]\n  Target bbox: [619.16, 326.19, 661.14, 393.09]\n\nFrame 3:\n  Drone pose: [108.55, -2.17, 20.0, -43.11, 98.93, 0.0]\n  Target bbox: [619.67, 323.99, 660.21, 395.56]\n\nFrame 4:\n  Drone pose: [108.45, -1.64, 20.0, -43.06, 100.04, 0.0]\n  Target bbox: [626.13, 329.03, 654.06, 390.28]\n\nFrame 5 (current):\n  Drone pose: [108.34, -1.11, 20.0, -43.13, 99.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.6, \"ymin\": 323.33, \"xmax\": 661.3, \"ymax\": 396.23}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.06, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 1.59, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.78, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": 2.11, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.45, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": 2.64, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 559.5, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.48, 2.58, 20.0, -43.23, 101.63, 0.0]\n  Target bbox: [619.01, 323.65, 660.86, 395.89]\n\nFrame 2:\n  Drone pose: [107.35, 3.11, 20.0, -43.16, 102.67, 0.0]\n  Target bbox: [616.7, 325.6, 663.57, 393.74]\n\nFrame 3:\n  Drone pose: [107.22, 3.64, 20.0, -43.24, 102.32, 0.0]\n  Target bbox: [618.08, 322.01, 661.76, 397.5]\n\nFrame 4:\n  Drone pose: [107.08, 4.18, 20.0, -43.17, 103.35, 0.0]\n  Target bbox: [622.56, 327.74, 657.64, 391.58]\n\nFrame 5 (current):\n  Drone pose: [106.95, 4.71, 20.0, -43.26, 103.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.03, \"ymin\": 323.68, \"xmax\": 660.83, \"ymax\": 395.86}, \"waypoint_deltas\": [{\"dx\": -0.13, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.78, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 2.15, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": 2.69, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.76, "window_alt_abs_m": 0.0, "target_px_mean_hist": 551.5, "cur_frame_id": 96, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/ORI/frames_playback/frame_00108/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.0, 9.01, 20.0, -43.29, 106.09, 0.0]\n  Target bbox: [618.99, 326.06, 661.2, 393.26]\n\nFrame 2:\n  Drone pose: [105.84, 9.53, 20.0, -43.39, 105.69, 0.0]\n  Target bbox: [620.31, 323.89, 659.57, 395.48]\n\nFrame 3:\n  Drone pose: [105.66, 10.06, 20.0, -43.3, 106.57, 0.0]\n  Target bbox: [624.62, 327.41, 655.55, 391.89]\n\nFrame 4:\n  Drone pose: [105.45, 10.58, 20.0, -43.41, 106.0, 0.0]\n  Target bbox: [619.15, 323.79, 660.68, 395.71]\n\nFrame 5 (current):\n  Drone pose: [105.19, 11.09, 20.0, -43.33, 106.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.27, \"ymin\": 325.77, \"xmax\": 661.92, \"ymax\": 393.56}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 560.2, "cur_frame_id": 108, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.28, -43.69, 21.97, -41.94, 85.48, 0.0]\n  Target bbox: [675.4, 404.94, 702.58, 461.7]\n\nFrame 2:\n  Drone pose: [108.51, -43.95, 21.22, -39.12, 85.99, 0.0]\n  Target bbox: [640.51, 413.82, 678.41, 473.94]\n\nFrame 3:\n  Drone pose: [107.78, -43.94, 20.71, -42.1, 80.69, 0.0]\n  Target bbox: [687.84, 340.98, 715.83, 401.34]\n\nFrame 4:\n  Drone pose: [107.86, -43.38, 20.54, -47.52, 89.05, 0.0]\n  Target bbox: [587.33, 247.5, 613.59, 305.16]\n\nFrame 5 (current):\n  Drone pose: [107.76, -42.78, 20.52, -42.61, 85.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.41, \"ymin\": 330.07, \"xmax\": 651.41, \"ymax\": 389.26}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.36, \"dz\": 0.07, \"dpitch\": 0.08, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 0.88, \"dz\": 0.05, \"dpitch\": 0.09, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.39, \"dz\": 0.03, \"dpitch\": 0.11, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.9, \"dz\": 0.01, \"dpitch\": 0.12, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.41, \"dz\": -0.1, \"dpitch\": 0.26, \"dyaw\": 0.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.59, "window_alt_abs_m": 1.45, "target_px_mean_hist": 523.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.79, -39.34, 20.36, -44.73, 81.25, 0.0]\n  Target bbox: [675.16, 289.09, 715.64, 351.4]\n\nFrame 2:\n  Drone pose: [107.89, -38.7, 20.39, -42.54, 85.93, 0.0]\n  Target bbox: [618.59, 328.32, 661.06, 390.97]\n\nFrame 3:\n  Drone pose: [107.87, -38.45, 20.22, -41.95, 85.92, 0.0]\n  Target bbox: [627.68, 329.8, 652.12, 389.53]\n\nFrame 4:\n  Drone pose: [107.75, -37.68, 20.19, -41.46, 90.55, 0.0]\n  Target bbox: [557.52, 345.34, 597.2, 404.81]\n\nFrame 5 (current):\n  Drone pose: [107.62, -37.39, 20.23, -42.03, 85.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.51, \"ymin\": 329.29, \"xmax\": 659.2, \"ymax\": 390.04}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.6, \"dz\": -0.01, \"dpitch\": -0.12, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": 1.11, \"dz\": -0.04, \"dpitch\": -0.09, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": 1.61, \"dz\": -0.06, \"dpitch\": -0.07, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": 2.12, \"dz\": -0.08, \"dpitch\": -0.06, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": 2.63, \"dz\": -0.1, \"dpitch\": -0.04, \"dyaw\": 0.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.62, "window_alt_abs_m": 0.27, "target_px_mean_hist": 533.5, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.9, -33.33, 20.13, -45.43, 80.98, 0.0]\n  Target bbox: [688.66, 272.9, 716.65, 333.95]\n\nFrame 2:\n  Drone pose: [107.79, -32.73, 20.08, -42.03, 85.66, 0.0]\n  Target bbox: [621.02, 328.92, 658.69, 390.38]\n\nFrame 3:\n  Drone pose: [107.79, -32.22, 20.07, -43.06, 90.66, 0.0]\n  Target bbox: [555.5, 312.76, 598.55, 375.29]\n\nFrame 4:\n  Drone pose: [107.79, -31.72, 20.06, -42.64, 88.87, 0.0]\n  Target bbox: [583.24, 320.05, 615.74, 379.63]\n\nFrame 5 (current):\n  Drone pose: [107.79, -31.21, 20.05, -42.01, 85.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.25, \"ymin\": 329.46, \"xmax\": 651.55, \"ymax\": 389.86}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.02, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.68, "window_alt_abs_m": 0.08, "target_px_mean_hist": 541.2, "cur_frame_id": 27, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.95, -27.75, 19.95, -37.78, 90.42, 0.0]\n  Target bbox: [565.33, 398.5, 605.1, 459.2]\n\nFrame 2:\n  Drone pose: [107.78, -27.17, 20.01, -46.72, 89.62, 0.0]\n  Target bbox: [569.86, 251.49, 609.54, 311.57]\n\nFrame 3:\n  Drone pose: [107.78, -26.67, 20.01, -42.01, 85.65, 0.0]\n  Target bbox: [619.09, 327.48, 660.56, 391.8]\n\nFrame 4:\n  Drone pose: [107.74, -26.12, 20.07, -44.26, 89.24, 0.0]\n  Target bbox: [578.2, 296.03, 608.64, 354.88]\n\nFrame 5 (current):\n  Drone pose: [107.76, -25.69, 19.87, -44.65, 85.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.89, \"ymin\": 280.2, \"xmax\": 666.27, \"ymax\": 342.76}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": 0.53, \"dz\": 0.14, \"dpitch\": 2.63, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": 1.04, \"dz\": 0.13, \"dpitch\": 2.63, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.54, \"dz\": 0.13, \"dpitch\": 2.63, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.04, \"dz\": 0.13, \"dpitch\": 2.63, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.54, \"dz\": 0.13, \"dpitch\": 2.62, \"dyaw\": 0.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.56, "window_alt_abs_m": 0.32, "target_px_mean_hist": 546.0, "cur_frame_id": 38, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.81, -21.64, 20.0, -38.07, 83.02, 0.0]\n  Target bbox: [659.93, 396.7, 687.14, 457.04]\n\nFrame 2:\n  Drone pose: [107.82, -21.13, 20.0, -42.04, 85.73, 0.0]\n  Target bbox: [626.88, 330.03, 652.92, 389.3]\n\nFrame 3:\n  Drone pose: [107.84, -20.63, 20.0, -42.05, 85.78, 0.0]\n  Target bbox: [620.47, 328.63, 659.23, 390.64]\n\nFrame 4:\n  Drone pose: [107.86, -20.13, 20.0, -46.95, 88.68, 0.0]\n  Target bbox: [591.41, 248.44, 616.99, 307.58]\n\nFrame 5 (current):\n  Drone pose: [107.99, -19.55, 20.13, -42.36, 86.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.74, \"ymin\": 329.2, \"xmax\": 657.98, \"ymax\": 390.09}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.43, \"dz\": -0.13, \"dpitch\": 0.29, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 0.93, \"dz\": -0.13, \"dpitch\": 0.28, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.44, \"dz\": -0.13, \"dpitch\": 0.27, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.94, \"dz\": -0.13, \"dpitch\": 0.25, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 2.45, \"dz\": -0.13, \"dpitch\": 0.23, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.15, "window_alt_abs_m": 0.13, "target_px_mean_hist": 547.8, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [108.68, -15.6, 19.92, -47.04, 89.26, 0.0]\n  Target bbox: [605.17, 245.39, 643.73, 305.97]\n\nFrame 2:\n  Drone pose: [108.66, -15.05, 20.0, -44.78, 86.48, 0.0]\n  Target bbox: [637.83, 284.77, 680.33, 348.91]\n\nFrame 3:\n  Drone pose: [108.69, -14.53, 19.94, -42.15, 88.07, 0.0]\n  Target bbox: [627.76, 329.45, 652.03, 389.84]\n\nFrame 4:\n  Drone pose: [108.95, -14.16, 19.91, -41.95, 88.81, 0.0]\n  Target bbox: [622.77, 329.57, 656.93, 389.71]\n\nFrame 5 (current):\n  Drone pose: [109.05, -13.55, 19.82, -42.86, 87.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 640.39, \"ymin\": 314.42, \"xmax\": 670.11, \"ymax\": 374.86}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.6, \"dz\": 0.18, \"dpitch\": 0.49, \"dyaw\": 1.38, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": 1.14, \"dz\": 0.18, \"dpitch\": 0.44, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": 1.67, \"dz\": 0.18, \"dpitch\": 0.39, \"dyaw\": 1.75, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 2.2, \"dz\": 0.18, \"dpitch\": 0.35, \"dyaw\": 3.24, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": 2.74, \"dz\": 0.18, \"dpitch\": 0.3, \"dyaw\": 3.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.07, "window_alt_abs_m": 0.25, "target_px_mean_hist": 565.0, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00073/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [109.34, -9.73, 20.0, -44.35, 93.62, 0.0]\n  Target bbox: [608.15, 300.04, 646.82, 362.22]\n\nFrame 2:\n  Drone pose: [109.33, -9.18, 20.0, -42.71, 92.6, 0.0]\n  Target bbox: [619.67, 323.54, 660.27, 395.99]\n\nFrame 3:\n  Drone pose: [109.31, -8.64, 20.0, -37.73, 96.46, 0.0]\n  Target bbox: [593.24, 413.26, 623.82, 475.11]\n\nFrame 4:\n  Drone pose: [109.27, -8.1, 20.0, -42.79, 93.84, 0.0]\n  Target bbox: [618.41, 323.01, 661.56, 396.58]\n\nFrame 5 (current):\n  Drone pose: [109.29, -7.57, 20.05, -37.85, 95.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.19, \"ymin\": 413.1, \"xmax\": 649.46, \"ymax\": 474.33}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": 0.56, \"dz\": -0.05, \"dpitch\": -5.02, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": 1.1, \"dz\": -0.05, \"dpitch\": -5.02, \"dyaw\": 0.74, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 1.64, \"dz\": -0.05, \"dpitch\": -5.09, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 2.18, \"dz\": -0.05, \"dpitch\": -5.07, \"dyaw\": 1.8, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": 2.72, \"dz\": -0.05, \"dpitch\": -5.14, \"dyaw\": 1.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.14, "window_alt_abs_m": 0.05, "target_px_mean_hist": 559.0, "cur_frame_id": 73, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [108.73, -3.24, 20.0, -43.08, 98.0, 0.0]\n  Target bbox: [622.88, 326.46, 657.1, 392.94]\n\nFrame 2:\n  Drone pose: [108.75, -2.74, 20.15, -43.2, 99.44, 0.0]\n  Target bbox: [620.24, 327.84, 660.02, 391.47]\n\nFrame 3:\n  Drone pose: [108.55, -2.17, 20.0, -45.81, 95.5, 0.0]\n  Target bbox: [662.43, 279.82, 702.5, 350.61]\n\nFrame 4:\n  Drone pose: [108.45, -1.64, 20.0, -42.46, 102.38, 0.0]\n  Target bbox: [591.92, 337.05, 630.35, 403.34]\n\nFrame 5 (current):\n  Drone pose: [108.5, -1.17, 20.0, -43.0, 100.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.47, \"ymin\": 325.58, \"xmax\": 657.38, \"ymax\": 393.9}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": 0.59, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.63, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.3, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": 1.65, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": 2.17, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": 2.7, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 2.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.46, "window_alt_abs_m": 0.31, "target_px_mean_hist": 543.5, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [107.56, 2.62, 19.84, -43.02, 101.87, 0.0]\n  Target bbox: [622.67, 326.39, 657.29, 392.98]\n\nFrame 2:\n  Drone pose: [107.4, 3.13, 19.83, -38.97, 100.59, 0.0]\n  Target bbox: [653.99, 394.6, 681.74, 458.04]\n\nFrame 3:\n  Drone pose: [107.24, 3.64, 19.99, -48.02, 104.27, 0.0]\n  Target bbox: [598.44, 245.62, 634.11, 312.67]\n\nFrame 4:\n  Drone pose: [106.94, 4.25, 19.98, -40.66, 103.87, 0.0]\n  Target bbox: [608.28, 370.55, 650.1, 437.21]\n\nFrame 5 (current):\n  Drone pose: [106.95, 4.71, 20.0, -43.26, 103.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.14, \"ymin\": 322.98, \"xmax\": 660.75, \"ymax\": 396.46}, \"waypoint_deltas\": [{\"dx\": -0.13, \"dy\": 0.54, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.78, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 2.15, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": 2.69, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.23, "window_alt_abs_m": 0.19, "target_px_mean_hist": 561.5, "cur_frame_id": 96, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347/aug_001/frames_playback/frame_00108/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.16, 8.99, 20.03, -40.66, 104.46, 0.0]\n  Target bbox: [647.83, 371.19, 683.05, 435.35]\n\nFrame 2:\n  Drone pose: [105.84, 9.53, 20.0, -42.95, 104.26, 0.0]\n  Target bbox: [641.52, 333.39, 673.65, 401.0]\n\nFrame 3:\n  Drone pose: [105.76, 10.16, 20.0, -43.95, 104.04, 0.0]\n  Target bbox: [652.45, 317.68, 699.12, 384.66]\n\nFrame 4:\n  Drone pose: [105.45, 10.58, 20.0, -39.79, 103.53, 0.0]\n  Target bbox: [649.04, 383.45, 691.94, 458.44]\n\nFrame 5 (current):\n  Drone pose: [105.19, 11.09, 20.0, -43.33, 106.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.45, \"ymin\": 327.85, \"xmax\": 653.72, \"ymax\": 391.45}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.07, "window_alt_abs_m": 0.04, "target_px_mean_hist": 559.2, "cur_frame_id": 108, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_347", "difficulty_score": 0.2458, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.36, -0.28, 22.0, -46.41, -98.53, 0.0]\n  Target bbox: [617.1, 322.17, 662.96, 397.22]\n\nFrame 2:\n  Drone pose: [103.03, -1.84, 21.2, -47.01, -96.5, 0.0]\n  Target bbox: [626.12, 325.47, 653.6, 393.66]\n\nFrame 3:\n  Drone pose: [102.28, -2.8, 20.67, -47.07, -94.34, 0.0]\n  Target bbox: [615.3, 319.08, 664.7, 400.18]\n\nFrame 4:\n  Drone pose: [101.86, -3.46, 20.64, -47.27, -94.62, 0.0]\n  Target bbox: [616.96, 320.56, 663.11, 398.7]\n\nFrame 5 (current):\n  Drone pose: [101.59, -4.0, 20.62, -47.27, -95.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.0, \"ymin\": 325.0, \"xmax\": 659.64, \"ymax\": 394.01}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.0, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": -1.02, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": -0.42, \"dy\": -1.52, \"dz\": -0.07, \"dpitch\": 0.01, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -0.51, \"dy\": -2.03, \"dz\": -0.09, \"dpitch\": 0.02, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": -2.54, \"dz\": -0.2, \"dpitch\": 0.16, \"dyaw\": 1.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.24, "window_alt_abs_m": 1.38, "target_px_mean_hist": 561.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.84, -7.55, 20.36, -47.05, -93.02, 0.0]\n  Target bbox: [618.0, 325.17, 661.57, 393.85]\n\nFrame 2:\n  Drone pose: [100.78, -8.05, 20.33, -47.02, -92.82, 0.0]\n  Target bbox: [626.27, 325.33, 653.44, 393.71]\n\nFrame 3:\n  Drone pose: [100.72, -8.56, 20.3, -46.99, -92.64, 0.0]\n  Target bbox: [621.47, 325.0, 658.16, 393.97]\n\nFrame 4:\n  Drone pose: [100.68, -9.07, 20.27, -46.96, -92.51, 0.0]\n  Target bbox: [616.9, 323.44, 662.61, 395.51]\n\nFrame 5 (current):\n  Drone pose: [100.65, -9.58, 20.24, -46.94, -92.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.55, \"ymin\": 324.72, \"xmax\": 654.14, \"ymax\": 394.27}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.55, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -2.07, \"dz\": -0.09, \"dpitch\": 0.03, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -2.6, \"dz\": -0.11, \"dpitch\": 0.01, \"dyaw\": -0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.59, "window_alt_abs_m": 0.11, "target_px_mean_hist": 630.5, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00026/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.14, -13.27, 20.1, -46.99, -94.0, 0.0]\n  Target bbox: [615.66, 323.62, 663.86, 395.36]\n\nFrame 2:\n  Drone pose: [101.31, -13.82, 20.09, -47.04, -94.56, 0.0]\n  Target bbox: [618.71, 322.47, 660.83, 396.5]\n\nFrame 3:\n  Drone pose: [101.5, -14.39, 20.08, -47.1, -95.19, 0.0]\n  Target bbox: [618.88, 324.08, 660.73, 394.88]\n\nFrame 4:\n  Drone pose: [101.7, -14.96, 20.07, -47.16, -95.87, 0.0]\n  Target bbox: [619.18, 323.9, 660.44, 395.06]\n\nFrame 5 (current):\n  Drone pose: [101.91, -15.54, 20.06, -47.25, -96.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.37, \"ymin\": 323.97, \"xmax\": 659.26, \"ymax\": 395.03}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": -0.59, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -0.66, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.18, \"dz\": -0.02, \"dpitch\": -0.2, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -1.79, \"dz\": -0.02, \"dpitch\": -0.33, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -2.41, \"dz\": -0.03, \"dpitch\": -0.5, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -3.04, \"dz\": -0.03, \"dpitch\": -0.58, \"dyaw\": -3.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.56, "window_alt_abs_m": 0.04, "target_px_mean_hist": 644.0, "cur_frame_id": 26, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.22, -19.86, 20.02, -48.01, -102.9, 0.0]\n  Target bbox: [614.46, 317.96, 665.65, 401.11]\n\nFrame 2:\n  Drone pose: [102.06, -20.5, 20.02, -48.11, -104.13, 0.0]\n  Target bbox: [619.69, 319.59, 660.59, 399.46]\n\nFrame 3:\n  Drone pose: [101.87, -21.15, 20.01, -48.21, -105.27, 0.0]\n  Target bbox: [617.07, 317.34, 663.27, 401.73]\n\nFrame 4:\n  Drone pose: [101.66, -21.8, 20.01, -48.31, -106.35, 0.0]\n  Target bbox: [614.85, 316.03, 665.51, 403.09]\n\nFrame 5 (current):\n  Drone pose: [101.44, -22.45, 20.01, -48.41, -107.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.01, \"ymin\": 316.07, \"xmax\": 666.35, \"ymax\": 403.11}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": -1.28, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -1.91, \"dz\": -0.01, \"dpitch\": -0.24, \"dyaw\": -2.93, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -2.54, \"dz\": -0.01, \"dpitch\": -0.3, \"dyaw\": -3.84, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": -3.16, \"dz\": -0.01, \"dpitch\": -0.36, \"dyaw\": -4.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.49, "window_alt_abs_m": 0.01, "target_px_mean_hist": 682.0, "cur_frame_id": 37, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [99.61, -26.86, 20.0, -48.85, -113.84, 0.0]\n  Target bbox: [616.57, 320.18, 663.69, 398.78]\n\nFrame 2:\n  Drone pose: [99.32, -27.48, 20.0, -48.89, -114.68, 0.0]\n  Target bbox: [617.94, 321.51, 662.28, 397.38]\n\nFrame 3:\n  Drone pose: [99.01, -28.1, 20.0, -48.93, -115.46, 0.0]\n  Target bbox: [617.57, 322.89, 662.6, 395.97]\n\nFrame 4:\n  Drone pose: [98.69, -28.7, 20.0, -48.95, -116.15, 0.0]\n  Target bbox: [619.4, 322.37, 660.84, 396.55]\n\nFrame 5 (current):\n  Drone pose: [98.36, -29.29, 20.0, -48.97, -116.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.35, \"ymin\": 316.78, \"xmax\": 663.12, \"ymax\": 402.21}, \"waypoint_deltas\": [{\"dx\": -0.32, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -1.18, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -2.22, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -3.0, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -2.92, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -3.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.99, "window_alt_abs_m": 0.0, "target_px_mean_hist": 674.2, "cur_frame_id": 48, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00060/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.97, -33.93, 20.0, -49.01, -121.28, 0.0]\n  Target bbox: [620.8, 322.77, 659.48, 396.08]\n\nFrame 2:\n  Drone pose: [95.64, -34.51, 20.0, -48.97, -121.9, 0.0]\n  Target bbox: [617.14, 317.95, 663.36, 401.08]\n\nFrame 3:\n  Drone pose: [95.29, -35.08, 20.0, -48.93, -122.49, 0.0]\n  Target bbox: [616.79, 320.28, 663.6, 398.58]\n\nFrame 4:\n  Drone pose: [94.94, -35.66, 20.0, -48.9, -123.08, 0.0]\n  Target bbox: [617.68, 317.89, 662.84, 401.12]\n\nFrame 5 (current):\n  Drone pose: [94.6, -36.23, 20.0, -48.86, -123.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.26, \"ymin\": 321.81, \"xmax\": 662.1, \"ymax\": 397.08}, \"waypoint_deltas\": [{\"dx\": -0.35, \"dy\": -0.58, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.58, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": -1.15, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -1.16, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -1.72, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -1.74, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -2.29, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -2.3, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -2.86, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": -2.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 674.8, "cur_frame_id": 60, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.15, -40.21, 20.0, -48.47, -127.53, 0.0]\n  Target bbox: [620.64, 321.75, 659.78, 397.22]\n\nFrame 2:\n  Drone pose: [91.79, -40.75, 20.0, -48.39, -128.01, 0.0]\n  Target bbox: [623.14, 323.3, 657.19, 395.59]\n\nFrame 3:\n  Drone pose: [91.39, -41.29, 20.0, -48.33, -128.36, 0.0]\n  Target bbox: [620.22, 326.14, 660.04, 392.69]\n\nFrame 4:\n  Drone pose: [91.08, -41.85, 20.0, -48.21, -128.98, 0.0]\n  Target bbox: [621.75, 326.16, 658.52, 392.71]\n\nFrame 5 (current):\n  Drone pose: [91.03, -42.38, 20.0, -47.77, -130.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.5, \"ymin\": 323.33, \"xmax\": 657.87, \"ymax\": 395.63}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": 0.44, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": 0.92, \"dyaw\": -2.26, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 1.46, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -2.23, \"dz\": 0.0, \"dpitch\": 1.98, \"dyaw\": -5.36, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -2.86, \"dz\": 0.0, \"dpitch\": 2.04, \"dyaw\": -6.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.68, "window_alt_abs_m": 0.0, "target_px_mean_hist": 663.8, "cur_frame_id": 71, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [89.27, -47.41, 20.0, -47.59, -137.88, 0.0]\n  Target bbox: [620.34, 324.0, 659.26, 394.91]\n\nFrame 2:\n  Drone pose: [88.04, -48.38, 20.0, -49.03, -137.38, 0.0]\n  Target bbox: [622.23, 324.63, 657.43, 394.15]\n\nFrame 3:\n  Drone pose: [86.88, -49.01, 20.0, -50.02, -136.12, 0.0]\n  Target bbox: [621.27, 324.3, 658.36, 394.38]\n\nFrame 4:\n  Drone pose: [85.97, -49.33, 20.0, -50.32, -134.64, 0.0]\n  Target bbox: [626.92, 321.16, 653.33, 397.58]\n\nFrame 5 (current):\n  Drone pose: [85.29, -49.48, 20.0, -50.09, -133.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.36, \"ymin\": 322.42, \"xmax\": 661.12, \"ymax\": 396.3}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": 0.89, \"dyaw\": 1.62, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": 1.25, \"dyaw\": 1.92, \"droll\": 0.0}, {\"dx\": -1.83, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": 1.54, \"dyaw\": 2.03, \"droll\": 0.0}, {\"dx\": -2.23, \"dy\": -1.33, \"dz\": 0.0, \"dpitch\": 1.25, \"dyaw\": 3.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 683.8, "cur_frame_id": 82, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [82.19, -51.6, 20.0, -48.96, -128.47, 0.0]\n  Target bbox: [619.89, 323.37, 660.47, 395.44]\n\nFrame 2:\n  Drone pose: [81.7, -51.99, 20.0, -48.81, -128.26, 0.0]\n  Target bbox: [620.23, 323.57, 660.11, 395.24]\n\nFrame 3:\n  Drone pose: [81.02, -52.36, 20.0, -48.82, -127.51, 0.0]\n  Target bbox: [618.93, 322.37, 661.47, 396.49]\n\nFrame 4:\n  Drone pose: [80.34, -52.73, 20.0, -48.83, -126.76, 0.0]\n  Target bbox: [618.78, 319.69, 661.73, 399.3]\n\nFrame 5 (current):\n  Drone pose: [79.66, -53.11, 20.0, -48.83, -126.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.9, \"ymin\": 321.21, \"xmax\": 660.53, \"ymax\": 397.76}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -0.74, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 2.26, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": -1.48, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 3.01, \"droll\": 0.0}, {\"dx\": -3.38, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 3.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.47, "window_alt_abs_m": 0.0, "target_px_mean_hist": 686.8, "cur_frame_id": 93, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/ORI/frames_playback/frame_00104/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.92, -55.71, 20.0, -48.72, -120.74, 0.0]\n  Target bbox: [617.86, 320.25, 661.97, 398.77]\n\nFrame 2:\n  Drone pose: [74.25, -56.08, 20.0, -49.1, -118.48, 0.0]\n  Target bbox: [616.9, 320.05, 662.92, 398.95]\n\nFrame 3:\n  Drone pose: [73.57, -56.45, 20.0, -49.44, -116.17, 0.0]\n  Target bbox: [620.05, 320.63, 659.71, 398.26]\n\nFrame 4:\n  Drone pose: [72.89, -56.82, 20.0, -49.73, -113.8, 0.0]\n  Target bbox: [617.66, 319.24, 662.05, 399.66]\n\nFrame 5 (current):\n  Drone pose: [72.21, -57.19, 20.0, -49.97, -111.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.65, \"ymin\": 318.85, \"xmax\": 665.09, \"ymax\": 399.99}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.38, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 2.45, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -0.75, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 4.93, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -1.12, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 7.43, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": 9.94, \"droll\": 0.0}, {\"dx\": -3.38, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 12.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.34, "window_alt_abs_m": 0.0, "target_px_mean_hist": 682.5, "cur_frame_id": 104, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.36, -0.28, 22.0, -46.41, -98.53, 0.0]\n  Target bbox: [617.13, 322.14, 662.91, 397.24]\n\nFrame 2:\n  Drone pose: [103.05, -1.78, 21.05, -51.22, -93.46, 0.0]\n  Target bbox: [584.14, 250.92, 622.97, 317.62]\n\nFrame 3:\n  Drone pose: [102.36, -2.83, 20.63, -47.06, -94.61, 0.0]\n  Target bbox: [616.11, 319.72, 663.83, 399.48]\n\nFrame 4:\n  Drone pose: [101.86, -3.46, 20.64, -46.53, -89.62, 0.0]\n  Target bbox: [555.98, 332.14, 608.98, 415.74]\n\nFrame 5 (current):\n  Drone pose: [101.49, -3.93, 20.7, -47.29, -95.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.74, \"ymin\": 324.46, \"xmax\": 660.87, \"ymax\": 394.5}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.58, \"dz\": -0.11, \"dpitch\": 0.02, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -1.09, \"dz\": -0.13, \"dpitch\": 0.02, \"dyaw\": 0.62, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -1.59, \"dz\": -0.15, \"dpitch\": 0.03, \"dyaw\": 0.96, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -2.1, \"dz\": -0.17, \"dpitch\": 0.04, \"dyaw\": 1.26, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": -2.61, \"dz\": -0.28, \"dpitch\": 0.18, \"dyaw\": 1.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.62, "window_alt_abs_m": 1.43, "target_px_mean_hist": 558.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.84, -7.55, 20.36, -47.05, -93.02, 0.0]\n  Target bbox: [619.58, 325.16, 660.03, 393.81]\n\nFrame 2:\n  Drone pose: [100.78, -8.05, 20.33, -47.02, -92.82, 0.0]\n  Target bbox: [621.71, 323.69, 657.88, 395.32]\n\nFrame 3:\n  Drone pose: [100.72, -8.56, 20.3, -46.99, -92.64, 0.0]\n  Target bbox: [622.91, 324.96, 656.73, 394.05]\n\nFrame 4:\n  Drone pose: [100.68, -9.07, 20.27, -46.96, -92.51, 0.0]\n  Target bbox: [618.68, 324.67, 660.91, 394.24]\n\nFrame 5 (current):\n  Drone pose: [100.48, -9.5, 20.25, -46.42, -94.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 649.84, \"ymin\": 331.97, \"xmax\": 691.74, \"ymax\": 401.72}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.59, \"dz\": -0.03, \"dpitch\": -0.5, \"dyaw\": 2.1, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.11, \"dz\": -0.06, \"dpitch\": -0.49, \"dyaw\": 2.05, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.63, \"dz\": -0.08, \"dpitch\": -0.48, \"dyaw\": 1.91, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": -2.15, \"dz\": -0.1, \"dpitch\": -0.49, \"dyaw\": 1.7, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -2.68, \"dz\": -0.12, \"dpitch\": -0.51, \"dyaw\": 1.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.51, "window_alt_abs_m": 0.11, "target_px_mean_hist": 627.0, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00026/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.14, -13.27, 20.1, -46.99, -94.0, 0.0]\n  Target bbox: [619.94, 323.93, 659.66, 395.02]\n\nFrame 2:\n  Drone pose: [101.31, -13.74, 20.07, -49.6, -99.54, 0.0]\n  Target bbox: [678.19, 278.38, 717.24, 352.76]\n\nFrame 3:\n  Drone pose: [101.5, -14.39, 20.08, -44.92, -92.62, 0.0]\n  Target bbox: [595.65, 361.77, 624.65, 431.15]\n\nFrame 4:\n  Drone pose: [101.81, -14.92, 20.18, -44.86, -99.44, 0.0]\n  Target bbox: [662.29, 363.94, 691.67, 436.71]\n\nFrame 5 (current):\n  Drone pose: [101.9, -15.62, 20.18, -47.76, -100.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 659.47, \"ymin\": 318.71, \"xmax\": 709.7, \"ymax\": 395.86}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.51, \"dz\": -0.13, \"dpitch\": 0.42, \"dyaw\": 3.24, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": -1.1, \"dz\": -0.14, \"dpitch\": 0.31, \"dyaw\": 2.68, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -1.71, \"dz\": -0.14, \"dpitch\": 0.18, \"dyaw\": 2.27, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -2.33, \"dz\": -0.15, \"dpitch\": 0.01, \"dyaw\": 2.05, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -2.96, \"dz\": -0.15, \"dpitch\": -0.07, \"dyaw\": 0.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.31, "window_alt_abs_m": 0.14, "target_px_mean_hist": 651.0, "cur_frame_id": 26, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.25, -20.04, 20.07, -46.04, -108.14, 0.0]\n  Target bbox: [674.06, 363.97, 718.62, 437.02]\n\nFrame 2:\n  Drone pose: [102.13, -20.45, 19.87, -47.77, -104.3, 0.0]\n  Target bbox: [618.83, 320.88, 661.32, 398.14]\n\nFrame 3:\n  Drone pose: [102.04, -21.1, 20.07, -45.25, -100.78, 0.0]\n  Target bbox: [556.86, 367.93, 609.78, 451.68]\n\nFrame 4:\n  Drone pose: [101.66, -21.8, 20.01, -48.31, -106.35, 0.0]\n  Target bbox: [614.22, 315.85, 666.14, 403.3]\n\nFrame 5 (current):\n  Drone pose: [101.44, -22.45, 20.01, -47.01, -107.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.83, \"ymin\": 339.41, \"xmax\": 670.82, \"ymax\": 426.66}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -1.49, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": -1.28, \"dz\": 0.0, \"dpitch\": -1.57, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -1.91, \"dz\": -0.01, \"dpitch\": -1.64, \"dyaw\": -2.52, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -2.54, \"dz\": -0.01, \"dpitch\": -1.7, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": -3.16, \"dz\": -0.01, \"dpitch\": -1.76, \"dyaw\": -4.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.37, "window_alt_abs_m": 0.46, "target_px_mean_hist": 658.2, "cur_frame_id": 37, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": 
"unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [99.7, -26.92, 19.93, -48.78, -114.22, 0.0]\n  Target bbox: [617.82, 320.5, 662.47, 398.49]\n\nFrame 2:\n  Drone pose: [99.32, -27.48, 20.0, -48.89, -114.68, 0.0]\n  Target bbox: [615.43, 318.77, 664.88, 400.17]\n\nFrame 3:\n  Drone pose: [99.06, -28.01, 20.15, -49.0, -115.48, 0.0]\n  Target bbox: [621.07, 321.01, 659.24, 397.94]\n\nFrame 4:\n  Drone pose: [98.58, -28.83, 19.93, -49.13, -116.0, 0.0]\n  Target bbox: [615.53, 316.36, 664.95, 402.68]\n\nFrame 5 (current):\n  Drone pose: [98.36, -29.29, 20.0, -52.2, -121.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 672.45, \"ymin\": 271.14, \"xmax\": 719.35, \"ymax\": 342.64}, \"waypoint_deltas\": [{\"dx\": -0.32, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": 3.23, \"dyaw\": 4.28, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -1.18, \"dz\": 0.0, \"dpitch\": 3.26, \"dyaw\": 3.55, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": 3.31, \"dyaw\": 2.78, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 3.37, \"dyaw\": 2.0, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -2.92, \"dz\": 0.0, \"dpitch\": 3.44, \"dyaw\": 1.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.61, "window_alt_abs_m": 0.52, "target_px_mean_hist": 657.2, "cur_frame_id": 48, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00060/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.97, -33.93, 20.0, -44.01, -117.16, 0.0]\n  Target bbox: [569.58, 402.02, 618.62, 487.83]\n\nFrame 2:\n  Drone pose: [95.5, -34.4, 19.95, -48.85, -121.31, 0.0]\n  Target bbox: [619.37, 322.37, 660.94, 396.55]\n\nFrame 3:\n  Drone pose: [95.13, -35.15, 20.08, -48.9, -126.22, 0.0]\n  Target bbox: [662.86, 327.26, 707.99, 407.35]\n\nFrame 4:\n  Drone pose: [94.94, -35.66, 20.0, -48.9, -123.08, 0.0]\n  Target bbox: [617.73, 318.28, 662.77, 400.73]\n\nFrame 5 (current):\n  Drone pose: [94.66, -36.2, 20.07, -52.46, -126.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 648.87, \"ymin\": 258.81, \"xmax\": 693.11, \"ymax\": 340.14}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": -0.61, \"dz\": -0.07, \"dpitch\": 3.64, \"dyaw\": 2.25, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -1.18, \"dz\": -0.07, \"dpitch\": 3.69, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": -1.75, \"dz\": -0.07, \"dpitch\": 3.74, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": -2.32, \"dz\": -0.07, \"dpitch\": 3.79, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -1.79, \"dy\": -2.89, \"dz\": -0.07, \"dpitch\": 3.86, \"dyaw\": -0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.63, "window_alt_abs_m": 0.33, "target_px_mean_hist": 664.2, "cur_frame_id": 60, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.15, -40.21, 20.0, -48.87, -128.62, 0.0]\n  Target bbox: [633.19, 315.47, 672.06, 390.56]\n\nFrame 2:\n  Drone pose: [91.89, -40.67, 19.88, -47.82, -124.32, 0.0]\n  Target bbox: [575.27, 323.22, 619.01, 404.02]\n\nFrame 3:\n  Drone pose: [91.31, -41.21, 19.89, -48.16, -127.97, 0.0]\n  Target bbox: [618.19, 323.83, 662.16, 394.97]\n\nFrame 4:\n  Drone pose: [91.08, -41.85, 20.0, -48.21, -128.98, 0.0]\n  Target bbox: [624.91, 322.48, 655.43, 396.45]\n\nFrame 5 (current):\n  Drone pose: [91.06, -42.57, 20.02, -48.01, -130.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.39, \"ymin\": 323.07, \"xmax\": 658.0, \"ymax\": 395.87}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": -0.33, \"dz\": -0.02, \"dpitch\": 0.68, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -0.84, \"dz\": -0.02, \"dpitch\": 1.16, \"dyaw\": -1.77, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -1.32, \"dz\": -0.02, \"dpitch\": 1.7, \"dyaw\": -2.82, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -2.04, \"dz\": -0.02, \"dpitch\": 2.22, \"dyaw\": -4.87, \"droll\": 0.0}, {\"dx\": -0.29, \"dy\": -2.67, \"dz\": -0.02, \"dpitch\": 2.28, \"dyaw\": -5.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.69, "window_alt_abs_m": 0.25, "target_px_mean_hist": 677.5, "cur_frame_id": 71, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [89.25, -47.48, 20.01, -46.7, -137.65, 0.0]\n  Target bbox: [615.4, 342.96, 655.73, 409.7]\n\nFrame 2:\n  Drone pose: [88.04, -48.38, 20.0, -48.64, -133.14, 0.0]\n  Target bbox: [579.2, 329.83, 606.67, 404.59]\n\nFrame 3:\n  Drone pose: [86.76, -48.89, 19.94, -49.95, -135.5, 0.0]\n  Target bbox: [625.1, 319.93, 654.53, 398.83]\n\nFrame 4:\n  Drone pose: [86.05, -49.23, 20.01, -50.11, -134.57, 0.0]\n  Target bbox: [620.09, 326.77, 660.1, 391.91]\n\nFrame 5 (current):\n  Drone pose: [85.26, -49.56, 20.11, -51.6, -138.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 675.34, \"ymin\": 304.42, \"xmax\": 712.21, \"ymax\": 377.61}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.04, \"dz\": -0.11, \"dpitch\": 1.96, \"dyaw\": 6.12, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": -0.24, \"dz\": -0.11, \"dpitch\": 2.4, \"dyaw\": 6.74, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -0.52, \"dz\": -0.11, \"dpitch\": 2.76, \"dyaw\": 7.04, \"droll\": 0.0}, {\"dx\": -1.8, \"dy\": -0.87, \"dz\": -0.11, \"dpitch\": 3.05, \"dyaw\": 7.15, \"droll\": 0.0}, {\"dx\": -2.2, \"dy\": -1.25, \"dz\": -0.11, \"dpitch\": 2.76, \"dyaw\": 8.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.63, "window_alt_abs_m": 0.23, "target_px_mean_hist": 685.0, "cur_frame_id": 82, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [82.1, -51.57, 20.08, -53.5, -123.16, 0.0]\n  Target bbox: [564.05, 249.99, 605.09, 325.63]\n\nFrame 2:\n  Drone pose: [81.7, -51.99, 20.0, -48.81, -128.26, 0.0]\n  Target bbox: [621.66, 324.35, 658.65, 394.51]\n\nFrame 3:\n  Drone pose: [81.02, -52.36, 20.0, -48.82, -127.51, 0.0]\n  Target bbox: [624.74, 321.39, 655.62, 397.52]\n\nFrame 4:\n  Drone pose: [80.22, -52.65, 19.91, -48.71, -126.25, 0.0]\n  Target bbox: [621.73, 318.64, 658.73, 400.27]\n\nFrame 5 (current):\n  Drone pose: [79.74, -53.27, 20.02, -50.84, -121.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 562.61, \"ymin\": 295.63, \"xmax\": 606.37, \"ymax\": 365.38}, \"waypoint_deltas\": [{\"dx\": -0.75, \"dy\": -0.21, \"dz\": -0.02, \"dpitch\": 2.01, \"dyaw\": -3.72, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": -0.58, \"dz\": -0.02, \"dpitch\": 2.02, \"dyaw\": -2.96, \"droll\": 0.0}, {\"dx\": -2.11, \"dy\": -0.95, \"dz\": -0.02, \"dpitch\": 2.03, \"dyaw\": -2.21, \"droll\": 0.0}, {\"dx\": -2.78, \"dy\": -1.32, \"dz\": -0.02, \"dpitch\": 2.05, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": -3.46, \"dy\": -1.69, \"dz\": -0.02, \"dpitch\": 2.07, \"dyaw\": -0.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.84, "window_alt_abs_m": 0.27, "target_px_mean_hist": 677.0, "cur_frame_id": 93, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815/aug_001/frames_playback/frame_00104/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.86, -55.79, 19.87, -48.21, -116.72, 0.0]\n  Target bbox: [575.15, 330.61, 615.57, 407.27]\n\nFrame 2:\n  Drone pose: [74.3, -56.05, 19.84, -45.58, -118.67, 0.0]\n  Target bbox: [618.41, 372.96, 662.99, 452.85]\n\nFrame 3:\n  Drone pose: [73.5, -56.4, 20.05, -50.75, -119.38, 0.0]\n  Target bbox: [653.21, 298.37, 704.41, 379.65]\n\nFrame 4:\n  Drone pose: [73.06, -56.8, 19.91, -46.36, -109.33, 0.0]\n  Target bbox: [561.91, 373.27, 607.36, 453.11]\n\nFrame 5 (current):\n  Drone pose: [72.21, -57.19, 20.0, -53.06, -108.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 580.31, \"ymin\": 268.23, \"xmax\": 635.58, \"ymax\": 348.37}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.38, \"dz\": 0.0, \"dpitch\": 3.1, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -0.75, \"dz\": 0.0, \"dpitch\": 2.95, \"dyaw\": 2.05, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -1.12, \"dz\": 0.0, \"dpitch\": 2.86, \"dyaw\": 4.55, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": 2.81, \"dyaw\": 7.06, \"droll\": 0.0}, {\"dx\": -3.38, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 2.82, \"dyaw\": 9.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.54, "window_alt_abs_m": 0.46, "target_px_mean_hist": 661.8, "cur_frame_id": 104, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776475815", "difficulty_score": 0.2525, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.39, 86.94, 22.0, -46.4, -90.0, 0.0]\n  Target bbox: [620.73, 327.46, 659.27, 391.74]\n\nFrame 2:\n  Drone pose: [103.5, 85.51, 21.2, -46.63, -87.31, 0.0]\n  Target bbox: [620.42, 325.6, 659.99, 393.51]\n\nFrame 3:\n  Drone pose: [103.13, 84.63, 20.67, -46.42, -86.14, 0.0]\n  Target bbox: [621.43, 324.21, 658.97, 394.86]\n\nFrame 4:\n  Drone pose: [103.02, 84.0, 20.64, -46.57, -85.77, 0.0]\n  Target bbox: [617.57, 324.68, 662.89, 394.45]\n\nFrame 5 (current):\n  Drone pose: [102.99, 83.45, 20.62, -46.59, -85.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.92, \"ymin\": 323.81, \"xmax\": 656.45, \"ymax\": 395.26}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.04, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.18, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.32, "window_alt_abs_m": 1.38, "target_px_mean_hist": 700.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.99, 78.83, 20.3, -46.31, -85.64, 0.0]\n  Target bbox: [616.89, 324.07, 663.6, 395.06]\n\nFrame 2:\n  Drone pose: [102.99, 78.32, 20.27, -46.28, -85.63, 0.0]\n  Target bbox: [616.95, 324.1, 663.52, 394.99]\n\nFrame 3:\n  Drone pose: [102.99, 77.81, 20.24, -46.26, -85.63, 0.0]\n  Target bbox: [618.83, 323.67, 661.63, 395.41]\n\nFrame 4:\n  Drone pose: [102.99, 77.3, 20.22, -46.24, -85.62, 0.0]\n  Target bbox: [616.84, 323.93, 663.65, 395.2]\n\nFrame 5 (current):\n  Drone pose: [102.99, 76.79, 20.19, -46.21, -85.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.89, \"ymin\": 323.9, \"xmax\": 663.58, \"ymax\": 395.17}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.02, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.53, \"dz\": -0.06, \"dpitch\": 0.04, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.03, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.54, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.02, "window_alt_abs_m": 0.11, "target_px_mean_hist": 758.8, "cur_frame_id": 17, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 71.71, 20.05, -46.12, -85.59, 0.0]\n  Target bbox: [623.19, 322.89, 657.19, 396.13]\n\nFrame 2:\n  Drone pose: [102.98, 71.21, 20.04, -46.12, -85.59, 0.0]\n  Target bbox: [623.96, 322.95, 656.41, 396.09]\n\nFrame 3:\n  Drone pose: [102.98, 70.7, 20.04, -46.12, -85.58, 0.0]\n  Target bbox: [616.71, 323.7, 663.78, 395.37]\n\nFrame 4:\n  Drone pose: [102.98, 70.19, 20.03, -46.12, -85.58, 0.0]\n  Target bbox: [619.2, 323.21, 661.25, 395.84]\n\nFrame 5 (current):\n  Drone pose: [102.98, 69.69, 20.03, -46.12, -85.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.2, \"ymin\": 322.93, \"xmax\": 662.28, \"ymax\": 396.14}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.02, \"dz\": -0.02, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.52, \"dz\": -0.02, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.02, "target_px_mean_hist": 781.2, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 64.65, 20.0, -46.16, -85.56, 0.0]\n  Target bbox: [618.22, 323.2, 662.26, 395.87]\n\nFrame 2:\n  Drone pose: [102.98, 64.14, 20.0, -46.16, -85.56, 0.0]\n  Target bbox: [621.81, 322.63, 658.61, 396.42]\n\nFrame 3:\n  Drone pose: [102.98, 63.64, 20.0, -46.16, -85.56, 0.0]\n  Target bbox: [621.27, 323.0, 659.16, 396.08]\n\nFrame 4:\n  Drone pose: [102.98, 63.14, 20.0, -46.17, -85.56, 0.0]\n  Target bbox: [623.6, 323.0, 656.79, 396.07]\n\nFrame 5 (current):\n  Drone pose: [102.98, 62.63, 20.0, -46.17, -85.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.38, \"ymin\": 322.76, \"xmax\": 657.0, \"ymax\": 396.28}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 768.0, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 57.61, 20.0, -46.21, -85.55, 0.0]\n  Target bbox: [617.64, 322.9, 662.84, 396.14]\n\nFrame 2:\n  Drone pose: [102.98, 57.11, 20.0, -46.21, -85.54, 0.0]\n  Target bbox: [616.63, 323.51, 663.85, 395.53]\n\nFrame 3:\n  Drone pose: [102.98, 56.6, 20.0, -46.21, -85.54, 0.0]\n  Target bbox: [619.68, 322.56, 660.78, 396.48]\n\nFrame 4:\n  Drone pose: [102.98, 56.1, 20.0, -46.22, -85.54, 0.0]\n  Target bbox: [619.29, 322.93, 661.17, 396.13]\n\nFrame 5 (current):\n  Drone pose: [102.98, 55.6, 20.0, -46.22, -85.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.54, \"ymin\": 323.03, \"xmax\": 657.87, \"ymax\": 396.04}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 777.5, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00073/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 50.59, 20.0, -46.24, -85.54, 0.0]\n  Target bbox: [618.1, 322.77, 662.39, 396.29]\n\nFrame 2:\n  Drone pose: [102.98, 50.09, 20.0, -46.24, -85.54, 0.0]\n  Target bbox: [618.26, 323.16, 662.22, 395.9]\n\nFrame 3:\n  Drone pose: [102.98, 49.58, 20.0, -46.25, -85.54, 0.0]\n  Target bbox: [617.83, 323.01, 662.67, 396.07]\n\nFrame 4:\n  Drone pose: [102.98, 49.08, 20.0, -46.25, -85.53, 0.0]\n  Target bbox: [617.05, 323.41, 663.45, 395.68]\n\nFrame 5 (current):\n  Drone pose: [102.98, 48.58, 20.0, -46.25, -85.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.32, \"ymin\": 322.75, \"xmax\": 660.13, \"ymax\": 396.32}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 765.0, "cur_frame_id": 73, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 43.57, 20.0, -46.26, -85.54, 0.0]\n  Target bbox: [617.92, 322.95, 662.57, 396.13]\n\nFrame 2:\n  Drone pose: [102.98, 43.07, 20.0, -46.26, -85.55, 0.0]\n  Target bbox: [619.74, 322.98, 660.7, 396.05]\n\nFrame 3:\n  Drone pose: [102.99, 42.57, 20.0, -46.26, -85.57, 0.0]\n  Target bbox: [623.56, 323.03, 656.82, 396.04]\n\nFrame 4:\n  Drone pose: [103.0, 42.07, 20.0, -46.27, -85.6, 0.0]\n  Target bbox: [616.61, 323.58, 663.88, 395.49]\n\nFrame 5 (current):\n  Drone pose: [103.01, 41.57, 20.0, -46.27, -85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.04, \"ymin\": 323.11, \"xmax\": 661.41, \"ymax\": 395.93}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -2.02, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": -2.52, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.1, "window_alt_abs_m": 0.0, "target_px_mean_hist": 765.2, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00101/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.98, 36.44, 20.0, -46.56, -88.68, 0.0]\n  Target bbox: [617.31, 324.21, 663.21, 394.78]\n\nFrame 2:\n  Drone pose: [104.19, 35.89, 20.0, -46.63, -89.37, 0.0]\n  Target bbox: [624.54, 323.4, 655.84, 395.56]\n\nFrame 3:\n  Drone pose: [104.43, 35.34, 20.0, -46.72, -90.13, 0.0]\n  Target bbox: [616.93, 324.37, 663.0, 394.61]\n\nFrame 4:\n  Drone pose: [104.68, 34.78, 20.0, -46.8, -90.94, 0.0]\n  Target bbox: [617.91, 323.75, 661.58, 395.24]\n\nFrame 5 (current):\n  Drone pose: [104.94, 34.22, 20.0, -46.9, -91.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.32, \"ymin\": 323.19, \"xmax\": 659.23, \"ymax\": 395.75}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.58, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -1.16, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -1.62, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -1.74, \"dz\": 0.0, \"dpitch\": -0.33, \"dyaw\": -2.29, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": -0.45, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": -2.92, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -3.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 792.8, "cur_frame_id": 101, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.19, 28.34, 20.0, -47.83, -101.04, 0.0]\n  Target bbox: [613.29, 316.92, 666.86, 402.29]\n\nFrame 2:\n  Drone pose: [104.91, 27.77, 20.0, -47.87, -101.82, 0.0]\n  Target bbox: [613.98, 317.53, 666.17, 401.64]\n\nFrame 3:\n  Drone pose: [104.61, 27.21, 20.0, -47.9, -102.49, 0.0]\n  Target bbox: [617.57, 319.67, 662.57, 399.36]\n\nFrame 4:\n  Drone pose: [104.27, 26.66, 20.0, -47.92, -103.05, 0.0]\n  Target bbox: [612.38, 316.33, 667.87, 402.98]\n\nFrame 5 (current):\n  Drone pose: [103.9, 26.12, 20.0, -47.93, -103.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.29, \"ymin\": 316.38, \"xmax\": 667.93, \"ymax\": 402.9}, \"waypoint_deltas\": [{\"dx\": -0.38, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.43, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -2.52, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -1.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.48, "window_alt_abs_m": 0.0, "target_px_mean_hist": 765.8, "cur_frame_id": 115, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/ORI/frames_playback/frame_00129/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.92, 21.28, 20.0, -47.67, -103.46, 0.0]\n  Target bbox: [613.43, 320.56, 666.17, 398.5]\n\nFrame 2:\n  Drone pose: [100.8, 20.82, 20.0, -47.65, -103.04, 0.0]\n  Target bbox: [616.87, 320.61, 662.74, 398.39]\n\nFrame 3:\n  Drone pose: [100.7, 20.36, 20.0, -47.62, -102.68, 0.0]\n  Target bbox: [617.17, 320.35, 662.42, 398.67]\n\nFrame 4:\n  Drone pose: [100.61, 19.9, 20.0, -47.59, -102.35, 0.0]\n  Target bbox: [621.18, 321.05, 658.46, 397.93]\n\nFrame 5 (current):\n  Drone pose: [100.52, 19.44, 20.0, -47.56, -102.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.54, \"ymin\": 320.86, \"xmax\": 665.03, \"ymax\": 398.21}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -0.12, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.62, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -1.89, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -2.37, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": 0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.39, "window_alt_abs_m": 0.0, "target_px_mean_hist": 797.2, "cur_frame_id": 129, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.39, 86.94, 22.0, -48.85, -85.06, 0.0]\n  Target bbox: [561.73, 286.96, 603.41, 353.59]\n\nFrame 2:\n  Drone pose: [103.5, 85.51, 21.2, -46.93, -87.19, 0.0]\n  Target bbox: [620.8, 320.09, 656.72, 388.88]\n\nFrame 3:\n  Drone pose: [103.06, 84.69, 20.51, -46.99, -83.33, 0.0]\n  Target bbox: [588.52, 308.72, 629.99, 380.66]\n\nFrame 4:\n  Drone pose: [103.02, 84.0, 20.64, -48.97, -90.77, 0.0]\n  Target bbox: [678.62, 285.6, 718.23, 356.21]\n\nFrame 5 (current):\n  Drone pose: [102.99, 83.45, 20.62, -46.59, -85.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.07, \"ymin\": 324.01, \"xmax\": 655.26, \"ymax\": 395.05}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.04, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.18, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.51, "window_alt_abs_m": 1.66, "target_px_mean_hist": 705.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.89, 78.71, 20.22, -46.37, -85.3, 0.0]\n  Target bbox: [622.91, 323.24, 657.48, 395.85]\n\nFrame 2:\n  Drone pose: [102.99, 78.32, 20.27, -51.28, -81.98, 0.0]\n  Target bbox: [573.54, 239.33, 620.57, 313.54]\n\nFrame 3:\n  Drone pose: [103.03, 77.99, 20.25, -45.99, -85.79, 0.0]\n  Target bbox: [617.3, 324.21, 663.18, 394.91]\n\nFrame 4:\n  Drone pose: [103.1, 77.23, 20.3, -48.82, -88.54, 0.0]\n  Target bbox: [650.58, 284.96, 690.86, 356.37]\n\nFrame 5 (current):\n  Drone pose: [102.99, 76.79, 20.19, -46.14, -82.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 583.36, \"ymin\": 324.78, \"xmax\": 627.33, \"ymax\": 398.07}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.02, \"dpitch\": -0.06, \"dyaw\": -2.96, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.02, \"dz\": -0.04, \"dpitch\": -0.04, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.53, \"dz\": -0.06, \"dpitch\": -0.03, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.03, \"dz\": -0.07, \"dpitch\": -0.02, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.54, \"dz\": -0.09, \"dpitch\": -0.01, \"dyaw\": -2.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.77, "window_alt_abs_m": 0.24, "target_px_mean_hist": 761.2, "cur_frame_id": 17, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.82, 71.63, 19.99, -45.75, -80.06, 0.0]\n  Target bbox: [559.41, 330.01, 603.69, 405.54]\n\nFrame 2:\n  Drone pose: [102.98, 71.21, 20.04, -46.12, -85.59, 0.0]\n  Target bbox: [616.58, 323.65, 663.91, 395.45]\n\nFrame 3:\n  Drone pose: [103.12, 70.66, 20.03, -43.19, -83.69, 0.0]\n  Target bbox: [597.16, 373.19, 628.99, 447.14]\n\nFrame 4:\n  Drone pose: [103.04, 70.07, 19.95, -46.21, -85.74, 0.0]\n  Target bbox: [616.61, 323.53, 663.88, 395.52]\n\nFrame 5 (current):\n  Drone pose: [102.98, 69.69, 20.03, -46.12, -85.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.7, \"ymin\": 323.63, \"xmax\": 663.78, \"ymax\": 395.43}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.01, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.02, \"dz\": -0.02, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.52, \"dz\": -0.02, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.63, "window_alt_abs_m": 0.22, "target_px_mean_hist": 785.5, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.93, 64.82, 19.94, -45.79, -85.44, 0.0]\n  Target bbox: [616.91, 323.63, 663.58, 395.49]\n\nFrame 2:\n  Drone pose: [102.98, 64.14, 20.0, -42.53, -90.2, 0.0]\n  Target bbox: [672.44, 386.59, 717.08, 457.54]\n\nFrame 3:\n  Drone pose: [102.98, 63.64, 20.0, -44.71, -81.56, 0.0]\n  Target bbox: [575.13, 347.89, 611.49, 422.39]\n\nFrame 4:\n  Drone pose: [102.98, 63.14, 20.0, -48.67, -85.91, 0.0]\n  Target bbox: [628.4, 280.78, 660.4, 354.23]\n\nFrame 5 (current):\n  Drone pose: [102.81, 62.6, 19.93, -46.1, -85.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.92, \"ymin\": 322.9, \"xmax\": 660.52, \"ymax\": 396.16}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.47, \"dz\": 0.07, \"dpitch\": -0.08, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -0.97, \"dz\": 0.07, \"dpitch\": -0.08, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.48, \"dz\": 0.07, \"dpitch\": -0.08, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.98, \"dz\": 0.07, \"dpitch\": -0.09, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -2.48, \"dz\": 0.07, \"dpitch\": -0.09, \"dyaw\": -0.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.65, "window_alt_abs_m": 0.13, "target_px_mean_hist": 772.5, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.92, 57.72, 19.97, -45.99, -85.39, 0.0]\n  Target bbox: [618.18, 323.17, 662.3, 395.91]\n\nFrame 2:\n  Drone pose: [102.98, 57.11, 20.0, -44.19, -86.43, 0.0]\n  Target bbox: [633.85, 356.79, 667.62, 430.16]\n\nFrame 3:\n  Drone pose: [102.91, 56.43, 20.04, -46.55, -85.27, 0.0]\n  Target bbox: [621.14, 322.68, 659.28, 396.32]\n\nFrame 4:\n  Drone pose: [103.02, 56.1, 19.93, -46.13, -85.67, 0.0]\n  Target bbox: [620.18, 322.7, 660.28, 396.37]\n\nFrame 5 (current):\n  Drone pose: [102.95, 55.56, 19.85, -44.29, -80.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 557.12, \"ymin\": 352.83, \"xmax\": 605.72, \"ymax\": 429.16}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.46, \"dz\": 0.15, \"dpitch\": -1.93, \"dyaw\": -5.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -0.96, \"dz\": 0.15, \"dpitch\": -1.94, \"dyaw\": -5.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.46, \"dz\": 0.15, \"dpitch\": -1.94, \"dyaw\": -5.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.97, \"dz\": 0.15, \"dpitch\": -1.94, \"dyaw\": -5.1, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.47, \"dz\": 0.15, \"dpitch\": -1.94, \"dyaw\": -5.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.81, "window_alt_abs_m": 0.26, "target_px_mean_hist": 770.2, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00073/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.83, 50.68, 19.97, -45.7, -85.73, 0.0]\n  Target bbox: [631.56, 328.44, 663.77, 401.73]\n\nFrame 2:\n  Drone pose: [103.05, 50.15, 19.86, -45.94, -85.79, 0.0]\n  Target bbox: [621.88, 322.73, 658.53, 396.29]\n\nFrame 3:\n  Drone pose: [103.0, 49.5, 20.16, -46.63, -85.6, 0.0]\n  Target bbox: [617.84, 323.13, 662.65, 395.92]\n\nFrame 4:\n  Drone pose: [103.07, 49.25, 20.05, -46.07, -85.87, 0.0]\n  Target bbox: [621.33, 323.09, 659.08, 395.95]\n\nFrame 5 (current):\n  Drone pose: [102.97, 48.63, 19.9, -46.02, -85.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.38, \"ymin\": 322.8, \"xmax\": 663.11, \"ymax\": 396.24}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.55, \"dz\": 0.1, \"dpitch\": -0.23, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.05, \"dz\": 0.1, \"dpitch\": -0.23, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.55, \"dz\": 0.1, \"dpitch\": -0.23, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -2.05, \"dz\": 0.1, \"dpitch\": -0.23, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -2.55, \"dz\": 0.1, \"dpitch\": -0.24, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.87, "window_alt_abs_m": 0.68, "target_px_mean_hist": 778.8, "cur_frame_id": 73, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.98, 43.57, 20.0, -46.26, -85.54, 0.0]\n  Target bbox: [623.12, 323.05, 657.27, 396.01]\n\nFrame 2:\n  Drone pose: [102.98, 43.07, 20.0, -46.26, -85.55, 0.0]\n  Target bbox: [616.6, 323.52, 663.89, 395.53]\n\nFrame 3:\n  Drone pose: [103.15, 42.66, 20.05, -50.59, -81.1, 0.0]\n  Target bbox: [559.65, 250.26, 603.21, 325.61]\n\nFrame 4:\n  Drone pose: [102.96, 42.23, 20.07, -46.9, -90.52, 0.0]\n  Target bbox: [675.05, 313.25, 722.96, 383.14]\n\nFrame 5 (current):\n  Drone pose: [103.01, 41.57, 20.0, -46.27, -85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.24, \"ymin\": 322.77, \"xmax\": 660.21, \"ymax\": 396.3}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -2.02, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": -2.52, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.75, "window_alt_abs_m": 0.14, "target_px_mean_hist": 776.0, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00101/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.93, 36.45, 19.89, -45.8, -86.54, 0.0]\n  Target bbox: [597.66, 332.4, 635.48, 405.98]\n\nFrame 2:\n  Drone pose: [104.19, 35.89, 20.0, -51.63, -84.59, 0.0]\n  Target bbox: [561.89, 240.03, 606.82, 314.12]\n\nFrame 3:\n  Drone pose: [104.4, 35.37, 20.02, -42.29, -85.03, 0.0]\n  Target bbox: [559.98, 398.7, 604.11, 472.44]\n\nFrame 4:\n  Drone pose: [104.79, 34.86, 20.07, -45.51, -96.26, 0.0]\n  Target bbox: [675.16, 345.65, 720.37, 419.46]\n\nFrame 5 (current):\n  Drone pose: [104.95, 34.2, 19.94, -46.83, -91.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.52, \"ymin\": 323.19, \"xmax\": 661.96, \"ymax\": 395.79}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": -0.56, \"dz\": 0.06, \"dpitch\": -0.18, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": 0.49, \"dy\": -1.14, \"dz\": 0.06, \"dpitch\": -0.28, \"dyaw\": -1.61, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -1.72, \"dz\": 0.06, \"dpitch\": -0.4, \"dyaw\": -2.28, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": -2.31, \"dz\": 0.06, \"dpitch\": -0.52, \"dyaw\": -2.76, \"droll\": 0.0}, {\"dx\": 0.9, \"dy\": -2.9, \"dz\": 0.06, \"dpitch\": -0.66, \"dyaw\": -3.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.09, "window_alt_abs_m": 0.31, "target_px_mean_hist": 782.5, "cur_frame_id": 101, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.21, 28.51, 19.96, -50.18, -95.99, 0.0]\n  Target bbox: [555.43, 273.66, 609.77, 359.05]\n\nFrame 2:\n  Drone pose: [104.93, 27.73, 19.9, -47.78, -101.91, 0.0]\n  Target bbox: [613.63, 316.8, 666.6, 402.44]\n\nFrame 3:\n  Drone pose: [104.58, 27.2, 19.9, -51.75, -102.14, 0.0]\n  Target bbox: [612.66, 250.76, 661.5, 334.8]\n\nFrame 4:\n  Drone pose: [104.26, 26.63, 20.11, -45.73, -107.28, 0.0]\n  Target bbox: [660.07, 358.18, 716.03, 444.11]\n\nFrame 5 (current):\n  Drone pose: [103.9, 25.94, 19.94, -49.87, -105.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 634.82, \"ymin\": 289.95, \"xmax\": 681.35, \"ymax\": 371.08}, \"waypoint_deltas\": [{\"dx\": -0.38, \"dy\": -0.34, \"dz\": 0.06, \"dpitch\": 1.95, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": -0.85, \"dz\": 0.06, \"dpitch\": 1.97, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": -1.36, \"dz\": 0.06, \"dpitch\": 2.0, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": -1.86, \"dz\": 0.06, \"dpitch\": 2.06, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -2.34, \"dz\": 0.06, \"dpitch\": 2.14, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.35, "window_alt_abs_m": 0.44, "target_px_mean_hist": 789.2, "cur_frame_id": 115, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604/aug_001/frames_playback/frame_00129/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.92, 21.24, 20.1, -52.89, -103.26, 0.0]\n  Target bbox: [612.45, 236.31, 661.65, 314.48]\n\nFrame 2:\n  Drone pose: [100.95, 20.83, 19.93, -46.36, -98.51, 0.0]\n  Target bbox: [562.69, 342.0, 602.49, 417.78]\n\nFrame 3:\n  Drone pose: [100.8, 20.35, 19.84, -47.35, -103.03, 0.0]\n  Target bbox: [619.86, 320.86, 659.76, 398.17]\n\nFrame 4:\n  Drone pose: [100.61, 19.9, 20.0, -47.59, -102.35, 0.0]\n  Target bbox: [615.61, 320.45, 663.97, 398.6]\n\nFrame 5 (current):\n  Drone pose: [100.54, 19.56, 20.04, -48.52, -100.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 595.14, \"ymin\": 303.32, \"xmax\": 646.39, \"ymax\": 379.12}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": -0.59, \"dz\": -0.04, \"dpitch\": 0.99, \"dyaw\": -1.42, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": -1.06, \"dz\": -0.04, \"dpitch\": 1.02, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": -1.53, \"dz\": -0.04, \"dpitch\": 1.05, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": -2.01, \"dz\": -0.04, \"dpitch\": 1.08, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -2.49, \"dz\": -0.04, \"dpitch\": 1.1, \"dyaw\": -0.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.9, "window_alt_abs_m": 0.47, "target_px_mean_hist": 782.8, "cur_frame_id": 129, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1604", "difficulty_score": 0.1815, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [20.39, 1.94, 22.0, -46.27, 0.0, 0.0]\n  Target bbox: [629.56, 328.46, 650.44, 390.83]\n\nFrame 2:\n  Drone pose: [19.99, 0.92, 21.2, -43.85, 2.81, 0.0]\n  Target bbox: [629.29, 328.47, 650.93, 390.88]\n\nFrame 3:\n  Drone pose: [20.12, 0.49, 20.67, -42.55, 3.92, 0.0]\n  Target bbox: [628.86, 330.66, 651.31, 388.62]\n\nFrame 4:\n  Drone pose: [20.52, 0.36, 20.64, -42.37, 4.25, 0.0]\n  Target bbox: [628.92, 331.16, 651.23, 388.12]\n\nFrame 5 (current):\n  Drone pose: [21.02, 0.34, 20.62, -42.32, 4.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.07, \"ymin\": 330.64, \"xmax\": 651.1, \"ymax\": 388.7}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.0, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 478.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.12, 0.34, 20.33, -42.03, 4.31, 0.0]\n  Target bbox: [628.55, 325.45, 651.75, 394.07]\n\nFrame 2:\n  Drone pose: [25.63, 0.34, 20.3, -42.0, 4.31, 0.0]\n  Target bbox: [628.33, 324.91, 652.01, 394.7]\n\nFrame 3:\n  Drone pose: [26.14, 0.34, 20.27, -41.97, 4.32, 0.0]\n  Target bbox: [628.77, 326.42, 651.5, 393.04]\n\nFrame 4:\n  Drone pose: [26.65, 0.34, 20.24, -41.95, 4.32, 0.0]\n  Target bbox: [628.57, 327.25, 651.69, 392.18]\n\nFrame 5 (current):\n  Drone pose: [27.16, 0.34, 20.22, -41.92, 4.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.14, \"ymin\": 326.7, \"xmax\": 652.13, \"ymax\": 392.73}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.01, \"dz\": -0.07, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.01, \"dz\": -0.09, \"dpitch\": 0.08, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.02, \"dz\": -0.1, \"dpitch\": 0.09, \"dyaw\": -0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.11, "target_px_mean_hist": 480.8, "cur_frame_id": 16, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [31.22, 0.42, 20.08, -41.8, 4.12, 0.0]\n  Target bbox: [628.65, 327.0, 651.6, 392.41]\n\nFrame 2:\n  Drone pose: [31.73, 0.46, 20.07, -41.8, 4.02, 0.0]\n  Target bbox: [628.47, 325.8, 651.83, 393.73]\n\nFrame 3:\n  Drone pose: [32.23, 0.5, 20.06, -41.8, 3.9, 0.0]\n  Target bbox: [627.87, 323.33, 652.49, 396.24]\n\nFrame 4:\n  Drone pose: [32.74, 0.56, 20.05, -41.81, 3.75, 0.0]\n  Target bbox: [628.85, 327.79, 651.38, 391.58]\n\nFrame 5 (current):\n  Drone pose: [33.25, 0.63, 20.04, -41.82, 3.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.96, \"ymin\": 329.03, \"xmax\": 651.25, \"ymax\": 390.33}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.17, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.28, \"dz\": -0.01, \"dpitch\": -0.04, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": 0.41, \"dz\": -0.02, \"dpitch\": -0.05, \"dyaw\": -1.11, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.55, \"dz\": -0.02, \"dpitch\": -0.05, \"dyaw\": -1.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.55, "window_alt_abs_m": 0.04, "target_px_mean_hist": 510.0, "cur_frame_id": 28, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [37.27, 1.69, 20.01, -41.85, 0.68, 0.0]\n  Target bbox: [629.56, 324.27, 650.77, 395.33]\n\nFrame 2:\n  Drone pose: [37.78, 1.86, 20.01, -41.86, 0.22, 0.0]\n  Target bbox: [629.44, 328.28, 650.67, 391.11]\n\nFrame 3:\n  Drone pose: [38.3, 2.02, 20.01, -41.88, -0.2, 0.0]\n  Target bbox: [629.26, 327.59, 650.64, 391.79]\n\nFrame 4:\n  Drone pose: [38.82, 2.15, 20.01, -41.9, -0.55, 0.0]\n  Target bbox: [629.22, 325.03, 650.48, 394.45]\n\nFrame 5 (current):\n  Drone pose: [39.35, 2.24, 20.0, -41.94, -0.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.7, \"ymin\": 327.71, \"xmax\": 655.38, \"ymax\": 391.72}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -1.55, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.67, \"droll\": 0.0}, {\"dx\": 1.66, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -3.09, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -3.09, \"droll\": 0.0}, {\"dx\": 2.8, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": -4.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.5, "window_alt_abs_m": 0.01, "target_px_mean_hist": 506.0, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [43.84, 2.22, 20.0, -42.43, -6.32, 0.0]\n  Target bbox: [622.16, 325.35, 657.88, 394.09]\n\nFrame 2:\n  Drone pose: [44.38, 2.17, 20.0, -42.42, -7.58, 0.0]\n  Target bbox: [626.17, 322.39, 653.44, 397.19]\n\nFrame 3:\n  Drone pose: [44.92, 2.13, 20.0, -42.48, -7.48, 0.0]\n  Target bbox: [625.12, 327.25, 655.0, 392.15]\n\nFrame 4:\n  Drone pose: [45.47, 2.09, 20.0, -42.45, -8.75, 0.0]\n  Target bbox: [627.32, 328.57, 652.49, 390.71]\n\nFrame 5 (current):\n  Drone pose: [46.01, 2.04, 20.0, -42.51, -8.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.19, \"ymin\": 327.09, \"xmax\": 652.57, \"ymax\": 392.31}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.08, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": 2.14, \"dy\": -0.28, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -2.05, \"droll\": 0.0}, {\"dx\": 2.68, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -1.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.75, "window_alt_abs_m": 0.0, "target_px_mean_hist": 506.8, "cur_frame_id": 52, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.28, 1.49, 20.0, -42.52, -12.74, 0.0]\n  Target bbox: [624.5, 322.44, 655.12, 397.1]\n\nFrame 2:\n  Drone pose: [50.81, 1.51, 20.0, -42.56, -12.82, 0.0]\n  Target bbox: [622.55, 326.18, 657.5, 393.2]\n\nFrame 3:\n  Drone pose: [51.35, 1.53, 20.0, -42.43, -14.24, 0.0]\n  Target bbox: [624.96, 325.06, 654.75, 394.39]\n\nFrame 4:\n  Drone pose: [51.88, 1.5, 20.0, -42.49, -14.19, 0.0]\n  Target bbox: [626.16, 326.3, 653.61, 393.09]\n\nFrame 5 (current):\n  Drone pose: [52.4, 1.54, 20.0, -42.5, -14.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.34, \"ymin\": 326.99, \"xmax\": 654.81, \"ymax\": 392.43}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -1.21, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -2.61, \"droll\": 0.0}, {\"dx\": 2.12, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -2.65, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.36, \"dyaw\": -3.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 492.8, "cur_frame_id": 64, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [56.64, 1.55, 20.0, -42.01, -19.8, 0.0]\n  Target bbox: [625.29, 328.43, 654.79, 390.89]\n\nFrame 2:\n  Drone pose: [57.18, 1.51, 20.0, -41.84, -21.0, 0.0]\n  Target bbox: [624.7, 325.87, 655.06, 393.63]\n\nFrame 3:\n  Drone pose: [57.71, 1.47, 20.0, -41.9, -20.92, 0.0]\n  Target bbox: [622.62, 322.58, 657.02, 397.04]\n\nFrame 4:\n  Drone pose: [58.25, 1.49, 20.0, -41.94, -21.02, 0.0]\n  Target bbox: [627.09, 327.86, 653.07, 391.55]\n\nFrame 5 (current):\n  Drone pose: [58.8, 1.48, 20.0, -41.76, -22.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.03, \"ymin\": 324.88, \"xmax\": 655.7, \"ymax\": 394.66}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.3, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 2.1, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.39, \"dyaw\": -2.43, \"droll\": 0.0}, {\"dx\": 2.95, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -2.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 490.8, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [66.06, 1.1, 20.0, -44.89, -30.59, 0.0]\n  Target bbox: [620.65, 321.48, 659.04, 397.91]\n\nFrame 2:\n  Drone pose: [67.17, -0.25, 20.0, -46.75, -27.84, 0.0]\n  Target bbox: [624.13, 325.84, 655.76, 393.22]\n\nFrame 3:\n  Drone pose: [67.34, -1.6, 20.0, -47.22, -23.42, 0.0]\n  Target bbox: [618.71, 321.21, 661.61, 398.03]\n\nFrame 4:\n  Drone pose: [67.35, -2.44, 20.0, -46.69, -21.78, 0.0]\n  Target bbox: [623.14, 322.94, 656.59, 396.24]\n\nFrame 5 (current):\n  Drone pose: [67.33, -2.81, 20.0, -46.13, -20.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.75, \"ymin\": 324.77, \"xmax\": 656.45, \"ymax\": 394.43}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": 0.77, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": 1.28, \"dyaw\": 1.28, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -0.81, \"dz\": 0.0, \"dpitch\": 2.06, \"dyaw\": 0.85, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": 2.17, \"dyaw\": 1.32, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": 2.41, \"dyaw\": 0.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.48, "window_alt_abs_m": 0.0, "target_px_mean_hist": 561.0, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00100/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [69.73, -4.47, 20.0, -43.7, -19.43, 0.0]\n  Target bbox: [626.61, 327.28, 653.55, 392.03]\n\nFrame 2:\n  Drone pose: [70.22, -4.76, 20.0, -43.58, -20.0, 0.0]\n  Target bbox: [619.65, 323.15, 660.61, 396.36]\n\nFrame 3:\n  Drone pose: [70.71, -5.09, 20.0, -43.48, -20.44, 0.0]\n  Target bbox: [624.88, 327.47, 655.23, 391.8]\n\nFrame 4:\n  Drone pose: [71.21, -5.47, 20.0, -43.42, -20.76, 0.0]\n  Target bbox: [623.91, 325.36, 656.28, 393.99]\n\nFrame 5 (current):\n  Drone pose: [71.71, -5.89, 20.0, -43.38, -20.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.01, \"ymin\": 322.95, \"xmax\": 660.23, \"ymax\": 396.46}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.92, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": -1.4, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -1.89, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": -2.37, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.56, "window_alt_abs_m": 0.0, "target_px_mean_hist": 523.8, "cur_frame_id": 100, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/ORI/frames_playback/frame_00112/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [75.55, -9.57, 20.0, -43.25, -20.34, 0.0]\n  Target bbox: [623.38, 323.0, 656.28, 396.51]\n\nFrame 2:\n  Drone pose: [75.99, -9.97, 20.0, -43.36, -19.21, 0.0]\n  Target bbox: [619.32, 322.52, 660.94, 397.0]\n\nFrame 3:\n  Drone pose: [76.43, -10.34, 20.0, -43.45, -19.5, 0.0]\n  Target bbox: [624.65, 326.23, 655.52, 393.13]\n\nFrame 4:\n  Drone pose: [76.88, -10.68, 20.0, -43.3, -19.89, 0.0]\n  Target bbox: [625.95, 327.8, 653.91, 391.46]\n\nFrame 5 (current):\n  Drone pose: [77.32, -10.97, 20.0, -43.37, -19.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.82, \"ymin\": 321.98, \"xmax\": 656.77, \"ymax\": 397.65}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 1.94, \"droll\": 0.0}, {\"dx\": 1.91, \"dy\": -0.85, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 2.38, \"droll\": 0.0}, {\"dx\": 2.44, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": -0.35, \"dyaw\": 2.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.65, "window_alt_abs_m": 0.0, "target_px_mean_hist": 499.5, "cur_frame_id": 112, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [20.37, 1.87, 21.89, -46.08, 0.23, 0.0]\n  Target bbox: [629.21, 330.35, 650.89, 388.87]\n\nFrame 2:\n  Drone pose: [19.99, 0.92, 21.2, -43.85, 2.81, 0.0]\n  Target bbox: [629.13, 325.52, 651.18, 394.02]\n\nFrame 3:\n  Drone pose: [20.11, 0.4, 20.62, -42.47, 4.15, 0.0]\n  Target bbox: [629.14, 330.46, 651.04, 388.88]\n\nFrame 4:\n  Drone pose: [20.52, 0.36, 20.64, -40.08, 8.76, 0.0]\n  Target bbox: [571.58, 369.89, 596.15, 429.1]\n\nFrame 5 (current):\n  Drone pose: [21.18, 0.34, 20.65, -43.43, 8.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 570.79, \"ymin\": 311.65, \"xmax\": 597.67, \"ymax\": 382.93}, \"waypoint_deltas\": [{\"dx\": 0.35, \"dy\": 0.0, \"dz\": -0.06, \"dpitch\": 1.12, \"dyaw\": -4.54, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": 1.14, \"dyaw\": -4.54, \"droll\": 0.0}, {\"dx\": 1.38, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 1.15, \"dyaw\": -4.53, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": 0.0, \"dz\": -0.12, \"dpitch\": 1.17, \"dyaw\": -4.53, \"droll\": 0.0}, {\"dx\": 2.4, \"dy\": 0.0, \"dz\": -0.23, \"dpitch\": 1.31, \"dyaw\": -4.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.6, "window_alt_abs_m": 1.3, "target_px_mean_hist": 473.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.12, 0.34, 20.33, -38.77, 6.17, 0.0]\n  Target bbox: [604.88, 382.5, 628.54, 446.82]\n\nFrame 2:\n  Drone pose: [25.63, 0.34, 20.3, -43.24, 6.39, 0.0]\n  Target bbox: [601.85, 304.89, 626.23, 373.67]\n\nFrame 3:\n  Drone pose: [26.14, 0.47, 20.18, -41.86, 3.96, 0.0]\n  Target bbox: [628.67, 324.87, 651.63, 394.62]\n\nFrame 4:\n  Drone pose: [26.65, 0.34, 20.24, -41.95, 4.32, 0.0]\n  Target bbox: [628.93, 330.16, 651.25, 389.18]\n\nFrame 5 (current):\n  Drone pose: [27.11, 0.5, 20.12, -41.73, 3.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.47, \"ymin\": 328.89, \"xmax\": 651.74, \"ymax\": 390.46}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": -0.16, \"dz\": 0.07, \"dpitch\": -0.16, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.16, \"dz\": 0.05, \"dpitch\": -0.14, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": 1.57, \"dy\": -0.15, \"dz\": 0.03, \"dpitch\": -0.13, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": 2.08, \"dy\": -0.15, \"dz\": 0.01, \"dpitch\": -0.11, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 2.59, \"dy\": -0.14, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 0.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.44, "window_alt_abs_m": 0.33, "target_px_mean_hist": 488.8, "cur_frame_id": 16, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [31.22, 0.42, 20.08, -42.12, 6.31, 0.0]\n  Target bbox: [600.42, 320.77, 624.51, 388.73]\n\nFrame 2:\n  Drone pose: [31.81, 0.53, 20.07, -41.92, 3.83, 0.0]\n  Target bbox: [628.73, 325.72, 651.57, 393.81]\n\nFrame 3:\n  Drone pose: [32.22, 0.49, 19.91, -41.56, 3.94, 0.0]\n  Target bbox: [628.73, 328.71, 651.48, 390.6]\n\nFrame 4:\n  Drone pose: [32.74, 0.56, 20.05, -41.81, 3.75, 0.0]\n  Target bbox: [628.82, 328.5, 651.4, 390.88]\n\nFrame 5 (current):\n  Drone pose: [33.11, 0.56, 19.94, -41.65, 6.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 596.53, \"ymin\": 327.03, \"xmax\": 620.04, \"ymax\": 387.03}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": 0.15, \"dz\": 0.1, \"dpitch\": -0.18, \"dyaw\": -2.89, \"droll\": 0.0}, {\"dx\": 1.17, \"dy\": 0.24, \"dz\": 0.09, \"dpitch\": -0.19, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": 0.35, \"dz\": 0.09, \"dpitch\": -0.21, \"dyaw\": -3.44, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": 0.48, \"dz\": 0.08, \"dpitch\": -0.22, \"dyaw\": -3.78, \"droll\": 0.0}, {\"dx\": 2.68, \"dy\": 0.62, \"dz\": 0.08, \"dpitch\": -0.22, \"dyaw\": -4.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.28, "window_alt_abs_m": 0.43, "target_px_mean_hist": 510.5, "cur_frame_id": 28, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [37.27, 1.69, 20.01, -44.16, -4.32, 0.0]\n  Target bbox: [690.52, 290.4, 714.79, 355.05]\n\nFrame 2:\n  Drone pose: [37.66, 1.91, 19.93, -41.57, 0.09, 0.0]\n  Target bbox: [629.44, 327.4, 650.61, 392.04]\n\nFrame 3:\n  Drone pose: [38.3, 2.02, 20.01, -45.22, 4.18, 0.0]\n  Target bbox: [572.22, 268.78, 597.94, 341.08]\n\nFrame 4:\n  Drone pose: [38.82, 2.26, 19.93, -41.79, -0.85, 0.0]\n  Target bbox: [629.13, 328.32, 650.65, 391.05]\n\nFrame 5 (current):\n  Drone pose: [39.35, 2.24, 20.0, -41.94, -0.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.31, \"ymin\": 326.21, \"xmax\": 656.69, \"ymax\": 393.22}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -1.55, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.67, \"droll\": 0.0}, {\"dx\": 1.66, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -3.09, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -3.09, \"droll\": 0.0}, {\"dx\": 2.8, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": -4.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.58, "window_alt_abs_m": 0.31, "target_px_mean_hist": 512.8, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [43.84, 2.22, 20.0, -42.43, -6.32, 0.0]\n  Target bbox: [621.51, 325.01, 658.53, 394.45]\n\nFrame 2:\n  Drone pose: [44.38, 2.17, 20.0, -42.48, -3.22, 0.0]\n  Target bbox: [572.15, 323.59, 598.96, 396.43]\n\nFrame 3:\n  Drone pose: [44.94, 2.04, 20.05, -41.04, -12.23, 0.0]\n  Target bbox: [684.46, 354.1, 719.87, 421.21]\n\nFrame 4:\n  Drone pose: [45.47, 2.16, 19.87, -46.52, -7.64, 0.0]\n  Target bbox: [609.75, 251.58, 636.97, 324.56]\n\nFrame 5 (current):\n  Drone pose: [46.01, 2.04, 20.0, -42.51, -8.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.77, \"ymin\": 323.41, \"xmax\": 653.87, \"ymax\": 396.08}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.08, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": 2.14, \"dy\": -0.28, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -2.05, \"droll\": 0.0}, {\"dx\": 2.68, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -1.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.69, "window_alt_abs_m": 0.36, "target_px_mean_hist": 511.0, "cur_frame_id": 52, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.3, 1.6, 19.89, -43.2, -8.17, 0.0]\n  Target bbox: [564.9, 315.11, 593.02, 379.34]\n\nFrame 2:\n  Drone pose: [50.81, 1.51, 20.0, -40.45, -7.82, 0.0]\n  Target bbox: [555.32, 359.44, 600.25, 434.64]\n\nFrame 3:\n  Drone pose: [51.46, 1.43, 19.91, -42.49, -14.05, 0.0]\n  Target bbox: [625.37, 324.26, 654.31, 395.23]\n\nFrame 4:\n  Drone pose: [51.88, 1.5, 20.0, -42.91, -16.79, 0.0]\n  Target bbox: [656.86, 316.97, 687.71, 389.6]\n\nFrame 5 (current):\n  Drone pose: [52.4, 1.67, 20.05, -42.53, -14.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.49, \"ymin\": 326.52, \"xmax\": 655.66, \"ymax\": 392.9}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.16, \"dz\": -0.05, \"dpitch\": 0.16, \"dyaw\": -0.94, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.19, \"dz\": -0.05, \"dpitch\": 0.11, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.17, \"dz\": -0.05, \"dpitch\": 0.26, \"dyaw\": -2.27, \"droll\": 0.0}, {\"dx\": 2.12, \"dy\": -0.16, \"dz\": -0.05, \"dpitch\": 0.24, \"dyaw\": -2.31, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": -0.2, \"dz\": -0.05, \"dpitch\": 0.39, \"dyaw\": -3.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.46, "window_alt_abs_m": 0.33, "target_px_mean_hist": 492.8, "cur_frame_id": 64, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [56.69, 1.46, 19.94, -45.33, -19.83, 0.0]\n  Target bbox: [626.58, 269.83, 659.33, 338.31]\n\nFrame 2:\n  Drone pose: [57.18, 1.51, 20.0, -39.8, -18.14, 0.0]\n  Target bbox: [589.79, 363.13, 618.08, 425.82]\n\nFrame 3:\n  Drone pose: [57.73, 1.5, 20.05, -36.99, -23.18, 0.0]\n  Target bbox: [649.26, 406.12, 685.02, 482.46]\n\nFrame 4:\n  Drone pose: [58.25, 1.49, 20.0, -38.26, -24.09, 0.0]\n  Target bbox: [663.29, 391.34, 694.13, 453.2]\n\nFrame 5 (current):\n  Drone pose: [58.87, 1.55, 20.05, -43.15, -24.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 652.77, \"ymin\": 302.94, \"xmax\": 686.56, \"ymax\": 374.75}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.07, \"dz\": -0.05, \"dpitch\": 1.39, \"dyaw\": 2.61, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -0.15, \"dz\": -0.05, \"dpitch\": 1.6, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": 1.37, \"dy\": -0.14, \"dz\": -0.05, \"dpitch\": 1.69, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": -0.13, \"dz\": -0.05, \"dpitch\": 1.78, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 2.88, \"dy\": -0.12, \"dz\": -0.05, \"dpitch\": 1.36, \"dyaw\": -0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.45, "window_alt_abs_m": 0.22, "target_px_mean_hist": 495.2, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [66.13, 1.14, 20.09, -45.39, -27.87, 0.0]\n  Target bbox: [585.41, 317.82, 623.63, 392.72]\n\nFrame 2:\n  Drone pose: [67.32, -0.36, 20.04, -45.85, -32.39, 0.0]\n  Target bbox: [676.6, 348.98, 709.85, 415.93]\n\nFrame 3:\n  Drone pose: [67.37, -1.76, 20.07, -47.48, -22.99, 0.0]\n  Target bbox: [625.01, 325.06, 655.2, 394.04]\n\nFrame 4:\n  Drone pose: [67.35, -2.44, 20.0, -49.29, -23.72, 0.0]\n  Target bbox: [645.42, 279.72, 679.86, 352.82]\n\nFrame 5 (current):\n  Drone pose: [67.32, -2.94, 19.95, -48.32, -18.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 599.29, \"ymin\": 283.55, \"xmax\": 643.71, \"ymax\": 361.87}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.24, \"dz\": 0.05, \"dpitch\": 2.96, \"dyaw\": -1.84, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -0.49, \"dz\": 0.05, \"dpitch\": 3.47, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -0.68, \"dz\": 0.05, \"dpitch\": 4.25, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -0.81, \"dz\": 0.05, \"dpitch\": 4.36, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": -0.94, \"dz\": 0.05, \"dpitch\": 4.6, \"dyaw\": -1.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.23, "window_alt_abs_m": 0.2, "target_px_mean_hist": 560.5, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00100/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [69.66, -4.54, 20.09, -41.85, -17.37, 0.0]\n  Target bbox: [601.24, 357.54, 634.61, 426.66]\n\nFrame 2:\n  Drone pose: [70.08, -4.67, 19.9, -45.4, -16.3, 0.0]\n  Target bbox: [572.49, 287.0, 613.43, 360.26]\n\nFrame 3:\n  Drone pose: [70.71, -5.09, 20.0, -43.48, -20.44, 0.0]\n  Target bbox: [624.44, 325.85, 655.76, 393.55]\n\nFrame 4:\n  Drone pose: [71.28, -5.52, 19.93, -43.42, -20.69, 0.0]\n  Target bbox: [623.8, 325.19, 656.39, 394.15]\n\nFrame 5 (current):\n  Drone pose: [71.79, -5.81, 19.94, -44.79, -19.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 602.58, \"ymin\": 300.47, \"xmax\": 640.3, \"ymax\": 371.43}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.53, \"dz\": 0.06, \"dpitch\": 1.43, \"dyaw\": -1.35, \"droll\": 0.0}, {\"dx\": 0.93, \"dy\": -1.0, \"dz\": 0.06, \"dpitch\": 1.44, \"dyaw\": -1.43, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": -1.48, \"dz\": 0.06, \"dpitch\": 1.46, \"dyaw\": -1.47, \"droll\": 0.0}, {\"dx\": 1.91, \"dy\": -1.97, \"dz\": 0.06, \"dpitch\": 1.48, \"dyaw\": -1.48, \"droll\": 0.0}, {\"dx\": 2.39, \"dy\": -2.45, \"dz\": 0.06, \"dpitch\": 1.53, \"dyaw\": -1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.37, "window_alt_abs_m": 0.37, "target_px_mean_hist": 503.8, "cur_frame_id": 100, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644/aug_001/frames_playback/frame_00112/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [75.48, -9.43, 19.92, -41.43, -25.66, 0.0]\n  Target bbox: [683.15, 350.12, 720.0, 424.88]\n\nFrame 2:\n  Drone pose: [75.99, -9.97, 20.0, -43.82, -17.89, 0.0]\n  Target bbox: [608.98, 318.99, 638.35, 385.27]\n\nFrame 3:\n  Drone pose: [76.5, -10.47, 20.08, -48.42, -18.32, 0.0]\n  Target bbox: [612.43, 246.14, 645.68, 315.27]\n\nFrame 4:\n  Drone pose: [76.88, -10.68, 20.0, -43.3, -19.89, 0.0]\n  Target bbox: [623.63, 322.81, 656.04, 396.67]\n\nFrame 5 (current):\n  Drone pose: [77.32, -10.97, 20.0, -44.51, -19.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.55, \"ymin\": 304.22, \"xmax\": 660.32, \"ymax\": 376.85}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": 1.08, \"dyaw\": 1.12, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 1.03, \"dyaw\": 1.76, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": 0.98, \"dyaw\": 2.31, \"droll\": 0.0}, {\"dx\": 1.91, \"dy\": -0.85, \"dz\": 0.0, \"dpitch\": 0.9, \"dyaw\": 2.75, \"droll\": 0.0}, {\"dx\": 2.44, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 0.79, \"dyaw\": 3.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.25, "window_alt_abs_m": 0.23, "target_px_mean_hist": 515.0, "cur_frame_id": 112, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776022644", "difficulty_score": 0.2893, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.39, -3.06, 22.0, -46.4, -90.0, 0.0]\n  Target bbox: [627.5, 328.2, 652.5, 390.93]\n\nFrame 2:\n  Drone pose: [101.36, -4.62, 21.2, -46.81, -86.87, 0.0]\n  Target bbox: [625.36, 327.94, 654.92, 391.23]\n\nFrame 3:\n  Drone pose: [100.91, -5.57, 20.67, -46.7, -85.41, 0.0]\n  Target bbox: [623.34, 325.52, 656.97, 393.5]\n\nFrame 4:\n  Drone pose: [100.77, -6.23, 20.64, -46.88, -84.94, 0.0]\n  Target bbox: [616.75, 324.19, 663.7, 394.84]\n\nFrame 5 (current):\n  Drone pose: [100.75, -6.77, 20.62, -46.9, -84.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.31, \"ymin\": 325.0, \"xmax\": 657.0, \"ymax\": 393.99}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.04, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.18, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.13, "window_alt_abs_m": 1.38, "target_px_mean_hist": 549.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.75, -9.85, 20.39, -46.69, -84.84, 0.0]\n  Target bbox: [622.9, 325.83, 657.43, 393.27]\n\nFrame 2:\n  Drone pose: [100.75, -10.36, 20.36, -46.67, -84.83, 0.0]\n  Target bbox: [619.06, 325.7, 661.32, 393.4]\n\nFrame 3:\n  Drone pose: [100.75, -10.88, 20.33, -46.64, -84.83, 0.0]\n  Target bbox: [621.93, 323.87, 658.44, 395.17]\n\nFrame 4:\n  Drone pose: [100.75, -11.39, 20.3, -46.62, -84.82, 0.0]\n  Target bbox: [623.09, 324.76, 657.23, 394.22]\n\nFrame 5 (current):\n  Drone pose: [100.75, -11.9, 20.27, -46.59, -84.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.03, \"ymin\": 324.73, \"xmax\": 655.27, \"ymax\": 394.26}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.02, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.53, \"dz\": -0.08, \"dpitch\": 0.07, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.04, \"dz\": -0.1, \"dpitch\": 0.08, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.55, \"dz\": -0.12, \"dpitch\": 0.1, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.02, "window_alt_abs_m": 0.12, "target_px_mean_hist": 590.8, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.74, -14.95, 20.13, -46.48, -84.79, 0.0]\n  Target bbox: [619.28, 325.5, 661.1, 393.58]\n\nFrame 2:\n  Drone pose: [100.74, -15.46, 20.12, -46.47, -84.79, 0.0]\n  Target bbox: [615.44, 324.45, 665.02, 394.62]\n\nFrame 3:\n  Drone pose: [100.74, -15.97, 20.1, -46.46, -84.78, 0.0]\n  Target bbox: [621.91, 323.74, 658.46, 395.28]\n\nFrame 4:\n  Drone pose: [100.74, -16.48, 20.09, -46.45, -84.78, 0.0]\n  Target bbox: [624.79, 323.95, 655.53, 395.07]\n\nFrame 5 (current):\n  Drone pose: [100.74, -16.98, 20.08, -46.44, -84.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.95, \"ymin\": 323.04, \"xmax\": 662.49, \"ymax\": 395.93}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.02, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.03, \"dz\": -0.04, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.54, \"dz\": -0.04, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.02, "window_alt_abs_m": 0.06, "target_px_mean_hist": 613.2, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.74, -20.53, 20.03, -46.43, -84.75, 0.0]\n  Target bbox: [620.67, 323.15, 659.74, 395.86]\n\nFrame 2:\n  Drone pose: [100.74, -21.03, 20.02, -46.43, -84.75, 0.0]\n  Target bbox: [619.68, 324.6, 660.69, 394.37]\n\nFrame 3:\n  Drone pose: [100.74, -21.54, 20.02, -46.43, -84.75, 0.0]\n  Target bbox: [620.88, 324.4, 659.48, 394.56]\n\nFrame 4:\n  Drone pose: [100.74, -22.04, 20.02, -46.44, -84.75, 0.0]\n  Target bbox: [620.65, 324.42, 659.7, 394.55]\n\nFrame 5 (current):\n  Drone pose: [100.74, -22.55, 20.01, -46.44, -84.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.04, \"ymin\": 324.27, \"xmax\": 656.28, \"ymax\": 394.71}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -2.52, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.01, "target_px_mean_hist": 606.0, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -25.57, 20.0, -46.46, -84.73, 0.0]\n  Target bbox: [619.12, 322.84, 661.32, 396.15]\n\nFrame 2:\n  Drone pose: [100.73, -26.07, 20.0, -46.47, -84.73, 0.0]\n  Target bbox: [626.58, 324.25, 653.7, 394.74]\n\nFrame 3:\n  Drone pose: [100.73, -26.58, 20.0, -46.47, -84.73, 0.0]\n  Target bbox: [627.35, 324.16, 652.92, 394.84]\n\nFrame 4:\n  Drone pose: [100.73, -27.08, 20.0, -46.48, -84.73, 0.0]\n  Target bbox: [623.06, 324.23, 657.27, 394.73]\n\nFrame 5 (current):\n  Drone pose: [100.73, -27.58, 20.0, -46.48, -84.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.52, \"ymin\": 324.83, \"xmax\": 658.84, \"ymax\": 394.23}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.52, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 608.8, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -31.1, 20.0, -46.51, -84.72, 0.0]\n  Target bbox: [620.15, 324.62, 660.23, 394.42]\n\nFrame 2:\n  Drone pose: [100.73, -31.6, 20.0, -46.51, -84.71, 0.0]\n  Target bbox: [619.98, 325.35, 660.39, 393.72]\n\nFrame 3:\n  Drone pose: [100.73, -32.11, 20.0, -46.51, -84.71, 0.0]\n  Target bbox: [617.76, 322.83, 662.68, 396.13]\n\nFrame 4:\n  Drone pose: [100.73, -32.61, 20.0, -46.52, -84.71, 0.0]\n  Target bbox: [623.55, 323.63, 656.79, 395.37]\n\nFrame 5 (current):\n  Drone pose: [100.73, -33.11, 20.0, -46.52, -84.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.02, \"ymin\": 323.93, \"xmax\": 654.27, \"ymax\": 395.04}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 617.2, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -36.12, 20.0, -46.54, -84.71, 0.0]\n  Target bbox: [618.16, 325.07, 662.24, 394.0]\n\nFrame 2:\n  Drone pose: [100.73, -36.62, 20.0, -46.54, -84.71, 0.0]\n  Target bbox: [626.08, 323.85, 654.22, 395.15]\n\nFrame 3:\n  Drone pose: [100.73, -37.12, 20.0, -46.53, -84.7, 0.0]\n  Target bbox: [619.39, 324.56, 660.98, 394.41]\n\nFrame 4:\n  Drone pose: [100.73, -37.62, 20.0, -46.31, -84.7, 0.0]\n  Target bbox: [626.61, 323.71, 653.67, 395.27]\n\nFrame 5 (current):\n  Drone pose: [100.73, -38.13, 20.0, -46.31, -84.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.25, \"ymin\": 324.75, \"xmax\": 659.11, \"ymax\": 394.31}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 611.8, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -41.13, 20.0, -46.32, -84.7, 0.0]\n  Target bbox: [619.43, 324.3, 660.96, 394.73]\n\nFrame 2:\n  Drone pose: [100.73, -41.63, 20.0, -46.33, -84.7, 0.0]\n  Target bbox: [619.89, 324.37, 660.5, 394.67]\n\nFrame 3:\n  Drone pose: [100.73, -42.13, 20.0, -46.33, -84.7, 0.0]\n  Target bbox: [615.07, 323.44, 665.4, 395.58]\n\nFrame 4:\n  Drone pose: [100.73, -42.64, 20.0, -46.33, -84.7, 0.0]\n  Target bbox: [616.71, 322.53, 663.76, 396.43]\n\nFrame 5 (current):\n  Drone pose: [100.73, -43.14, 20.0, -46.33, -84.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.58, \"ymin\": 323.69, \"xmax\": 653.71, \"ymax\": 395.3}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 621.2, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.74, -46.64, 20.0, -46.34, -84.72, 0.0]\n  Target bbox: [616.12, 324.47, 664.33, 394.6]\n\nFrame 2:\n  Drone pose: [100.74, -47.14, 20.0, -46.35, -84.74, 0.0]\n  Target bbox: [619.05, 324.41, 661.33, 394.57]\n\nFrame 3:\n  Drone pose: [100.75, -47.64, 20.0, -46.35, -84.77, 0.0]\n  Target bbox: [617.6, 324.69, 662.82, 394.38]\n\nFrame 4:\n  Drone pose: [100.76, -48.14, 20.0, -46.36, -84.81, 0.0]\n  Target bbox: [615.66, 322.79, 664.83, 396.21]\n\nFrame 5 (current):\n  Drone pose: [100.78, -48.65, 20.0, -46.36, -84.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.72, \"ymin\": 324.22, \"xmax\": 661.68, \"ymax\": 394.79}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.33, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.15, "window_alt_abs_m": 0.0, "target_px_mean_hist": 614.5, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.08, -51.67, 20.0, -46.45, -85.81, 0.0]\n  Target bbox: [617.3, 322.78, 663.17, 396.15]\n\nFrame 2:\n  Drone pose: [101.17, -52.17, 20.0, -46.47, -86.08, 0.0]\n  Target bbox: [623.33, 323.94, 657.02, 394.99]\n\nFrame 3:\n  Drone pose: [101.27, -52.68, 20.0, -46.5, -86.39, 0.0]\n  Target bbox: [620.16, 325.39, 660.23, 393.66]\n\nFrame 4:\n  Drone pose: [101.37, -53.19, 20.0, -46.53, -86.74, 0.0]\n  Target bbox: [618.43, 324.7, 661.99, 394.26]\n\nFrame 5 (current):\n  Drone pose: [101.49, -53.7, 20.0, -46.56, -87.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.75, \"ymin\": 323.84, \"xmax\": 664.76, \"ymax\": 395.13}, \"waypoint_deltas\": [{\"dx\": 0.13, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.8, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.19, \"droll\": 0.0}, {\"dx\": 0.48, \"dy\": -2.07, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -1.54, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": -2.59, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": -1.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.3, "window_alt_abs_m": 0.0, "target_px_mean_hist": 629.8, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.38, -2.99, 21.89, -51.17, -89.55, 0.0]\n  Target bbox: [617.16, 244.62, 652.94, 306.38]\n\nFrame 2:\n  Drone pose: [101.32, -4.58, 21.38, -50.89, -87.28, 0.0]\n  Target bbox: [624.44, 262.38, 667.75, 326.34]\n\nFrame 3:\n  Drone pose: [100.91, -5.57, 20.67, -48.27, -85.45, 0.0]\n  Target bbox: [625.81, 300.54, 655.43, 365.95]\n\nFrame 4:\n  Drone pose: [100.82, -6.3, 20.71, -47.1, -85.07, 0.0]\n  Target bbox: [623.86, 325.37, 656.45, 393.62]\n\nFrame 5 (current):\n  Drone pose: [100.75, -6.77, 20.62, -46.9, -84.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.2, \"ymin\": 325.56, \"xmax\": 660.16, \"ymax\": 393.5}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 0.04, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.18, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.68, "window_alt_abs_m": 1.35, "target_px_mean_hist": 546.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.75, -9.85, 20.39, -51.69, -86.47, 0.0]\n  Target bbox: [640.62, 240.18, 678.38, 311.0]\n\nFrame 2:\n  Drone pose: [100.75, -10.36, 20.36, -48.02, -79.83, 0.0]\n  Target bbox: [565.33, 302.75, 599.03, 374.52]\n\nFrame 3:\n  Drone pose: [100.75, -10.88, 20.33, -46.64, -84.83, 0.0]\n  Target bbox: [620.5, 323.59, 659.9, 395.43]\n\nFrame 4:\n  Drone pose: [100.67, -11.38, 20.34, -41.66, -85.72, 0.0]\n  Target bbox: [631.73, 407.43, 675.66, 479.95]\n\nFrame 5 (current):\n  Drone pose: [100.83, -11.93, 20.41, -46.85, -85.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.77, \"ymin\": 324.91, \"xmax\": 654.52, \"ymax\": 394.09}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": -0.48, \"dz\": -0.17, \"dpitch\": 0.28, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -0.99, \"dz\": -0.19, \"dpitch\": 0.31, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -1.5, \"dz\": -0.22, \"dpitch\": 0.33, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -2.01, \"dz\": -0.24, \"dpitch\": 0.34, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -2.52, \"dz\": -0.26, \"dpitch\": 0.36, \"dyaw\": 0.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.15, "window_alt_abs_m": 0.14, "target_px_mean_hist": 601.5, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.78, -15.07, 20.27, -44.73, -84.31, 0.0]\n  Target bbox: [613.42, 359.37, 653.71, 431.46]\n\nFrame 2:\n  Drone pose: [100.74, -15.46, 20.12, -46.47, -84.79, 0.0]\n  Target bbox: [624.7, 323.99, 655.62, 395.03]\n\nFrame 3:\n  Drone pose: [100.7, -16.08, 20.04, -46.53, -84.62, 0.0]\n  Target bbox: [616.27, 322.94, 664.2, 396.04]\n\nFrame 4:\n  Drone pose: [100.86, -16.34, 20.15, -42.85, -89.23, 0.0]\n  Target bbox: [673.7, 384.59, 700.87, 454.14]\n\nFrame 5 (current):\n  Drone pose: [100.74, -16.98, 20.08, -46.44, -84.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.19, \"ymin\": 324.81, \"xmax\": 662.2, \"ymax\": 394.19}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.51, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.02, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.52, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.03, \"dz\": -0.04, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.54, \"dz\": -0.04, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.71, "window_alt_abs_m": 0.41, "target_px_mean_hist": 610.2, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.77, -20.71, 19.99, -44.83, -89.62, 0.0]\n  Target bbox: [677.68, 358.39, 714.32, 425.91]\n\nFrame 2:\n  Drone pose: [100.84, -20.89, 20.06, -46.28, -85.13, 0.0]\n  Target bbox: [625.38, 325.57, 654.91, 393.53]\n\nFrame 3:\n  Drone pose: [100.71, -21.68, 19.92, -46.51, -84.63, 0.0]\n  Target bbox: [620.37, 324.5, 660.01, 394.54]\n\nFrame 4:\n  Drone pose: [100.74, -22.04, 20.02, -46.44, -84.75, 0.0]\n  Target bbox: [625.26, 325.63, 655.03, 393.46]\n\nFrame 5 (current):\n  Drone pose: [100.66, -22.55, 20.01, -48.79, -85.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 632.52, \"ymin\": 285.51, \"xmax\": 679.8, \"ymax\": 354.44}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 2.35, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 2.34, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 2.34, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": 2.33, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -2.52, \"dz\": -0.01, \"dpitch\": 2.33, \"dyaw\": 1.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.2, "window_alt_abs_m": 0.3, "target_px_mean_hist": 609.2, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -25.57, 20.0, -43.21, -85.08, 0.0]\n  Target bbox: [621.1, 377.18, 667.74, 451.02]\n\nFrame 2:\n  Drone pose: [100.73, -26.07, 20.0, -44.73, -79.73, 0.0]\n  Target bbox: [557.56, 352.48, 606.09, 428.69]\n\nFrame 3:\n  Drone pose: [100.73, -26.58, 20.0, -46.47, -84.73, 0.0]\n  Target bbox: [625.86, 324.24, 654.43, 394.75]\n\nFrame 4:\n  Drone pose: [100.73, -27.08, 20.0, -43.86, -81.57, 0.0]\n  Target bbox: [581.13, 368.27, 625.35, 440.03]\n\nFrame 5 (current):\n  Drone pose: [100.7, -27.77, 20.0, -47.86, -85.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.84, \"ymin\": 305.83, \"xmax\": 680.0, \"ymax\": 376.7}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": 1.38, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -0.82, \"dz\": 0.0, \"dpitch\": 1.37, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.32, \"dz\": 0.0, \"dpitch\": 1.37, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.82, \"dz\": 0.0, \"dpitch\": 1.36, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": 1.36, \"dyaw\": 1.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.73, "window_alt_abs_m": 0.0, "target_px_mean_hist": 625.0, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.73, -31.1, 20.0, -51.15, -80.48, 0.0]\n  Target bbox: [564.73, 246.39, 616.24, 319.25]\n\nFrame 2:\n  Drone pose: [100.73, -31.6, 20.0, -43.03, -79.71, 0.0]\n  Target bbox: [558.62, 383.33, 605.02, 456.17]\n\nFrame 3:\n  Drone pose: [100.73, -32.11, 20.0, -45.36, -84.15, 0.0]\n  Target bbox: [612.16, 344.47, 654.99, 413.44]\n\nFrame 4:\n  Drone pose: [100.73, -32.61, 20.0, -46.52, -84.71, 0.0]\n  Target bbox: [618.11, 324.67, 662.28, 394.32]\n\nFrame 5 (current):\n  Drone pose: [100.72, -33.22, 20.11, -46.39, -89.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 680.86, \"ymin\": 335.69, \"xmax\": 714.99, \"ymax\": 402.66}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.39, \"dz\": -0.11, \"dpitch\": -0.13, \"dyaw\": 4.92, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -0.89, \"dz\": -0.11, \"dpitch\": -0.14, \"dyaw\": 4.92, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.4, \"dz\": -0.11, \"dpitch\": -0.14, \"dyaw\": 4.92, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.9, \"dz\": -0.11, \"dpitch\": -0.14, \"dyaw\": 4.92, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -2.4, \"dz\": -0.11, \"dpitch\": -0.14, \"dyaw\": 4.92, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.69, "window_alt_abs_m": 0.11, "target_px_mean_hist": 619.5, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.68, -36.23, 19.92, -44.38, -88.58, 0.0]\n  Target bbox: [665.66, 361.8, 709.86, 433.53]\n\nFrame 2:\n  Drone pose: [100.73, -36.62, 20.0, -45.57, -89.47, 0.0]\n  Target bbox: [680.11, 342.34, 710.92, 412.39]\n\nFrame 3:\n  Drone pose: [100.73, -37.12, 20.0, -44.96, -87.13, 0.0]\n  Target bbox: [654.75, 350.86, 682.28, 421.49]\n\nFrame 4:\n  Drone pose: [100.6, -37.65, 19.86, -48.66, -81.11, 0.0]\n  Target bbox: [585.71, 281.11, 619.63, 354.28]\n\nFrame 5 (current):\n  Drone pose: [100.78, -38.09, 20.18, -43.04, -82.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 601.82, \"ymin\": 382.63, \"xmax\": 630.5, \"ymax\": 454.56}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.54, \"dz\": -0.18, \"dpitch\": -3.28, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -1.04, \"dz\": -0.18, \"dpitch\": -3.28, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -1.54, \"dz\": -0.18, \"dpitch\": -3.28, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.04, \"dz\": -0.18, \"dpitch\": -3.28, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.54, \"dz\": -0.18, \"dpitch\": -3.28, \"dyaw\": -1.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.97, "window_alt_abs_m": 0.54, "target_px_mean_hist": 617.5, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": 
"unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.63, -40.98, 20.01, -49.15, -89.43, 0.0]\n  Target bbox: [677.0, 275.74, 721.02, 344.19]\n\nFrame 2:\n  Drone pose: [100.73, -41.63, 20.0, -46.65, -84.28, 0.0]\n  Target bbox: [620.53, 319.64, 649.82, 388.47]\n\nFrame 3:\n  Drone pose: [100.73, -42.13, 20.0, -46.33, -84.7, 0.0]\n  Target bbox: [615.43, 323.09, 665.04, 395.91]\n\nFrame 4:\n  Drone pose: [100.74, -42.47, 20.05, -44.7, -87.53, 0.0]\n  Target bbox: [655.32, 348.87, 690.15, 419.93]\n\nFrame 5 (current):\n  Drone pose: [100.79, -43.02, 19.85, -48.12, -81.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 584.9, \"ymin\": 287.11, \"xmax\": 624.57, \"ymax\": 359.72}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": -0.62, \"dz\": 0.15, \"dpitch\": 1.79, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -1.12, \"dz\": 0.15, \"dpitch\": 1.79, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -1.62, \"dz\": 0.15, \"dpitch\": 1.79, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.12, \"dz\": 0.15, \"dpitch\": 1.78, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.62, \"dz\": 0.15, \"dpitch\": 1.78, \"dyaw\": -2.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.0, "window_alt_abs_m": 0.26, "target_px_mean_hist": 611.0, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.74, -46.64, 20.0, -43.84, -82.18, 0.0]\n  Target bbox: [591.1, 367.17, 629.46, 436.86]\n\nFrame 2:\n  Drone pose: [100.7, -47.13, 20.11, -43.35, -83.37, 0.0]\n  Target bbox: [604.72, 375.16, 646.77, 449.32]\n\nFrame 3:\n  Drone pose: [100.73, -47.63, 20.15, -46.55, -84.71, 0.0]\n  Target bbox: [618.97, 324.65, 661.4, 394.32]\n\nFrame 4:\n  Drone pose: [100.66, -48.1, 19.97, -45.1, -79.5, 0.0]\n  Target bbox: [558.42, 343.67, 604.77, 416.79]\n\nFrame 5 (current):\n  Drone pose: [100.78, -48.65, 20.0, -50.55, -89.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 677.91, \"ymin\": 256.83, \"xmax\": 716.41, \"ymax\": 325.05}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 4.18, \"dyaw\": 4.79, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 4.17, \"dyaw\": 4.68, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 4.15, \"dyaw\": 4.54, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -2.01, \"dz\": 0.0, \"dpitch\": 4.14, \"dyaw\": 4.37, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -2.51, \"dz\": 0.0, \"dpitch\": 4.12, \"dyaw\": 4.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.98, "window_alt_abs_m": 0.36, "target_px_mean_hist": 619.5, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.93, -51.66, 20.0, -45.53, -84.65, 0.0]\n  Target bbox: [618.79, 338.76, 645.47, 409.76]\n\nFrame 2:\n  Drone pose: [101.29, -52.09, 20.13, -45.65, -90.37, 0.0]\n  Target bbox: [670.73, 341.97, 699.48, 408.86]\n\nFrame 3:\n  Drone pose: [101.27, -52.68, 20.0, -43.8, -90.34, 0.0]\n  Target bbox: [664.76, 372.03, 707.95, 439.88]\n\nFrame 4:\n  Drone pose: [101.42, -53.33, 20.14, -51.95, -84.39, 0.0]\n  Target bbox: [587.24, 238.35, 634.91, 313.26]\n\nFrame 5 (current):\n  Drone pose: [101.36, -53.79, 19.92, -46.57, -86.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.05, \"ymin\": 325.55, \"xmax\": 657.29, \"ymax\": 393.48}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.43, \"dz\": 0.08, \"dpitch\": -0.02, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": -0.94, \"dz\": 0.08, \"dpitch\": -0.06, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -1.46, \"dz\": 0.08, \"dpitch\": -0.09, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": -1.98, \"dz\": 0.08, \"dpitch\": -0.13, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": -2.5, \"dz\": 0.08, \"dpitch\": -0.17, \"dyaw\": -2.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.98, "window_alt_abs_m": 0.6, "target_px_mean_hist": 617.5, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776299694", "difficulty_score": 0.1962, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.11, 19.97, 22.0, -46.4, 87.14, 0.0]\n  Target bbox: [617.35, 322.69, 662.64, 396.77]\n\nFrame 2:\n  Drone pose: [-54.94, 19.57, 21.2, -43.9, 83.61, 0.0]\n  Target bbox: [621.33, 327.53, 658.36, 391.79]\n\nFrame 3:\n  Drone pose: [-55.24, 19.7, 20.67, -42.59, 82.94, 0.0]\n  Target bbox: [619.95, 326.92, 659.72, 392.42]\n\nFrame 4:\n  Drone pose: [-55.28, 20.1, 20.64, -42.41, 82.86, 0.0]\n  Target bbox: [619.68, 327.12, 660.0, 392.21]\n\nFrame 5 (current):\n  Drone pose: [-55.24, 20.6, 20.62, -42.38, 82.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.92, \"ymin\": 328.01, \"xmax\": 656.79, \"ymax\": 391.39}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 1.54, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.05, \"dz\": -0.09, \"dpitch\": 0.05, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 2.56, \"dz\": -0.2, \"dpitch\": 0.19, \"dyaw\": 0.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.37, "window_alt_abs_m": 1.38, "target_px_mean_hist": 505.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.08, 23.67, 20.39, -42.16, 83.37, 0.0]\n  Target bbox: [626.28, 327.82, 653.46, 391.58]\n\nFrame 2:\n  Drone pose: [-55.06, 24.18, 20.36, -42.13, 83.43, 0.0]\n  Target bbox: [624.03, 327.56, 655.68, 391.83]\n\nFrame 3:\n  Drone pose: [-55.04, 24.69, 20.33, -42.11, 83.48, 0.0]\n  Target bbox: [617.43, 326.74, 662.19, 392.66]\n\nFrame 4:\n  Drone pose: [-55.02, 25.2, 20.3, -42.08, 83.52, 0.0]\n  Target bbox: [624.22, 326.51, 655.49, 392.79]\n\nFrame 5 (current):\n  Drone pose: [-55.01, 25.71, 20.27, -42.06, 83.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.46, \"ymin\": 326.72, \"xmax\": 662.15, \"ymax\": 392.68}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 1.03, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 1.54, \"dz\": -0.08, \"dpitch\": 0.08, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": 2.05, \"dz\": -0.1, \"dpitch\": 0.1, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": 2.57, \"dz\": -0.12, \"dpitch\": 0.12, \"dyaw\": -0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.17, "window_alt_abs_m": 0.12, "target_px_mean_hist": 531.0, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.4, 29.33, 20.12, -41.93, 82.46, 0.0]\n  Target bbox: [622.05, 325.4, 657.61, 393.94]\n\nFrame 2:\n  Drone pose: [-55.53, 29.87, 20.1, -41.93, 82.1, 0.0]\n  Target bbox: [624.81, 326.09, 654.91, 393.23]\n\nFrame 3:\n  Drone pose: [-55.67, 30.41, 20.09, -41.93, 81.71, 0.0]\n  Target bbox: [620.81, 326.97, 658.87, 392.43]\n\nFrame 4:\n  Drone pose: [-55.81, 30.94, 20.08, -41.93, 81.33, 0.0]\n  Target bbox: [621.75, 324.98, 657.91, 394.39]\n\nFrame 5 (current):\n  Drone pose: [-55.93, 31.47, 20.07, -41.93, 81.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.94, \"ymin\": 326.08, \"xmax\": 660.74, \"ymax\": 393.27}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": 0.53, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": 1.08, \"dz\": -0.02, \"dpitch\": -0.04, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": 1.63, \"dz\": -0.03, \"dpitch\": -0.09, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": 2.18, \"dz\": -0.03, \"dpitch\": -0.15, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 2.73, \"dz\": -0.04, \"dpitch\": -0.19, \"dyaw\": -0.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.46, "window_alt_abs_m": 0.05, "target_px_mean_hist": 543.5, "cur_frame_id": 25, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.16, 34.73, 20.03, -42.16, 80.28, 0.0]\n  Target bbox: [618.77, 325.83, 660.92, 393.5]\n\nFrame 2:\n  Drone pose: [-56.11, 35.27, 20.02, -42.22, 80.38, 0.0]\n  Target bbox: [620.78, 325.59, 659.19, 393.87]\n\nFrame 3:\n  Drone pose: [-56.03, 35.81, 20.02, -42.17, 79.23, 0.0]\n  Target bbox: [617.56, 322.46, 662.58, 397.21]\n\nFrame 4:\n  Drone pose: [-55.91, 36.37, 20.02, -42.14, 78.18, 0.0]\n  Target bbox: [617.28, 322.06, 662.79, 397.51]\n\nFrame 5 (current):\n  Drone pose: [-55.77, 36.94, 20.01, -42.12, 77.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.76, \"ymin\": 324.27, \"xmax\": 658.35, \"ymax\": 395.14}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.56, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.92, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 1.13, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -1.8, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": 1.71, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -2.66, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": 2.29, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -3.55, \"droll\": 0.0}, {\"dx\": 0.9, \"dy\": 2.86, \"dz\": -0.01, \"dpitch\": 0.09, \"dyaw\": -4.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.28, "window_alt_abs_m": 0.01, "target_px_mean_hist": 525.0, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.58, 40.93, 20.0, -42.1, 72.07, 0.0]\n  Target bbox: [619.07, 323.06, 661.09, 396.42]\n\nFrame 2:\n  Drone pose: [-54.47, 41.48, 20.0, -42.0, 71.03, 0.0]\n  Target bbox: [624.3, 324.71, 655.46, 394.66]\n\nFrame 3:\n  Drone pose: [-54.37, 42.03, 20.0, -42.11, 71.25, 0.0]\n  Target bbox: [619.49, 325.27, 660.58, 394.18]\n\nFrame 4:\n  Drone pose: [-54.27, 42.57, 20.0, -41.97, 70.18, 0.0]\n  Target bbox: [615.23, 322.49, 664.53, 397.02]\n\nFrame 5 (current):\n  Drone pose: [-54.18, 43.1, 20.0, -42.06, 70.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.38, \"ymin\": 324.16, \"xmax\": 661.39, \"ymax\": 395.31}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.58, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": 0.49, \"dy\": 2.11, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -1.41, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 2.63, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.55, "window_alt_abs_m": 0.0, "target_px_mean_hist": 536.0, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.24, 46.77, 20.0, -42.01, 68.8, 0.0]\n  Target bbox: [619.42, 323.41, 660.78, 396.06]\n\nFrame 2:\n  Drone pose: [-53.08, 47.3, 20.0, -41.87, 67.91, 0.0]\n  Target bbox: [617.88, 322.69, 661.9, 396.78]\n\nFrame 3:\n  Drone pose: [-52.93, 47.81, 20.0, -41.97, 68.27, 0.0]\n  Target bbox: [618.84, 320.95, 661.51, 398.64]\n\nFrame 4:\n  Drone pose: [-52.79, 48.33, 20.0, -41.81, 67.36, 0.0]\n  Target bbox: [617.25, 323.71, 662.59, 395.78]\n\nFrame 5 (current):\n  Drone pose: [-52.64, 48.84, 20.0, -41.89, 67.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.19, \"ymin\": 324.65, \"xmax\": 655.59, \"ymax\": 394.74}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.49, \"droll\": 0.0}, {\"dx\": 0.48, \"dy\": 1.54, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": 2.05, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 2.56, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.5, "window_alt_abs_m": 0.0, "target_px_mean_hist": 530.5, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-51.57, 51.91, 20.0, -41.75, 66.58, 0.0]\n  Target bbox: [616.22, 321.92, 663.56, 397.61]\n\nFrame 2:\n  Drone pose: [-51.36, 52.41, 20.0, -41.87, 67.1, 0.0]\n  Target bbox: [618.88, 323.31, 661.41, 396.32]\n\nFrame 3:\n  Drone pose: [-51.14, 52.91, 20.0, -41.72, 66.39, 0.0]\n  Target bbox: [618.12, 323.86, 661.72, 395.63]\n\nFrame 4:\n  Drone pose: [-50.93, 53.41, 20.0, -41.83, 66.93, 0.0]\n  Target bbox: [622.5, 327.31, 657.61, 392.08]\n\nFrame 5 (current):\n  Drone pose: [-50.7, 53.9, 20.0, -41.68, 66.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.8, \"ymin\": 324.42, \"xmax\": 662.03, \"ymax\": 395.15}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": 1.22, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 1.92, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": 1.33, \"dy\": 2.4, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.2, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.73, 57.24, 20.0, -41.72, 67.6, 0.0]\n  Target bbox: [622.47, 325.74, 657.33, 393.76]\n\nFrame 2:\n  Drone pose: [-48.4, 57.71, 20.0, -41.85, 68.46, 0.0]\n  Target bbox: [623.42, 323.76, 656.84, 395.7]\n\nFrame 3:\n  Drone pose: [-48.08, 58.17, 20.0, -41.72, 68.04, 0.0]\n  Target bbox: [617.42, 323.83, 662.41, 395.65]\n\nFrame 4:\n  Drone pose: [-47.77, 58.64, 20.0, -41.83, 68.84, 0.0]\n  Target bbox: [618.37, 322.33, 661.93, 397.32]\n\nFrame 5 (current):\n  Drone pose: [-47.5, 59.11, 20.0, -41.68, 68.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.67, \"ymin\": 323.86, \"xmax\": 662.14, \"ymax\": 395.64}, \"waypoint_deltas\": [{\"dx\": 0.24, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": 0.92, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 1.19, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": 1.38, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": 1.85, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.05, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": 2.32, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 2.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.59, "window_alt_abs_m": 0.0, "target_px_mean_hist": 526.8, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.62, 62.16, 20.0, -42.16, 70.48, 0.0]\n  Target bbox: [623.6, 325.98, 656.18, 393.47]\n\nFrame 2:\n  Drone pose: [-46.61, 62.89, 20.0, -42.46, 70.28, 0.0]\n  Target bbox: [614.46, 322.99, 665.31, 396.52]\n\nFrame 3:\n  Drone pose: [-46.61, 63.63, 20.0, -42.77, 70.08, 0.0]\n  Target bbox: [613.99, 322.06, 665.76, 397.44]\n\nFrame 4:\n  Drone pose: [-46.6, 64.36, 20.0, -43.08, 69.88, 0.0]\n  Target bbox: [624.07, 324.34, 655.7, 394.96]\n\nFrame 5 (current):\n  Drone pose: [-46.59, 65.1, 20.0, -43.4, 69.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.13, \"ymin\": 321.14, \"xmax\": 664.6, \"ymax\": 398.25}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.73, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": 1.47, \"dz\": 0.0, \"dpitch\": -0.64, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": 2.2, \"dz\": 0.0, \"dpitch\": -0.96, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 2.93, \"dz\": 0.0, \"dpitch\": -1.29, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": 3.67, \"dz\": 0.0, \"dpitch\": -1.62, \"dyaw\": -1.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.81, "window_alt_abs_m": 0.0, "target_px_mean_hist": 556.2, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.53, 70.24, 20.0, -45.7, 68.08, 0.0]\n  Target bbox: [614.8, 319.83, 664.94, 399.39]\n\nFrame 2:\n  Drone pose: [-46.53, 70.97, 20.0, -46.04, 67.83, 0.0]\n  Target bbox: [613.73, 319.3, 666.02, 399.9]\n\nFrame 3:\n  Drone pose: [-46.52, 71.71, 20.0, -46.38, 67.57, 0.0]\n  Target bbox: [619.69, 322.33, 660.5, 396.73]\n\nFrame 4:\n  Drone pose: [-46.51, 72.44, 20.0, -46.42, 65.85, 0.0]\n  Target bbox: [622.27, 321.02, 658.05, 398.12]\n\nFrame 5 (current):\n  Drone pose: [-46.5, 73.17, 20.0, -46.42, 64.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.08, \"ymin\": 320.74, \"xmax\": 663.24, \"ymax\": 398.46}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.74, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 1.31, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -2.43, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 1.88, \"dz\": 0.0, \"dpitch\": 0.37, \"dyaw\": -3.13, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 2.45, \"dz\": 0.0, \"dpitch\": 0.44, \"dyaw\": -3.84, \"droll\": 0.0}, {\"dx\": 1.14, \"dy\": 3.02, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -5.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.96, "window_alt_abs_m": 0.0, "target_px_mean_hist": 607.5, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.16, 20.14, 22.01, -43.78, 90.69, 0.0]\n  Target bbox: [575.54, 373.54, 617.75, 444.8]\n\nFrame 2:\n  Drone pose: [-54.94, 19.57, 21.2, -42.69, 85.3, 0.0]\n  Target bbox: [597.25, 346.88, 640.65, 413.26]\n\nFrame 3:\n  Drone pose: [-55.31, 19.69, 20.76, -42.69, 82.77, 0.0]\n  Target bbox: [623.99, 326.9, 655.74, 392.4]\n\nFrame 4:\n  Drone pose: [-55.18, 20.11, 20.7, -40.26, 78.11, 0.0]\n  Target bbox: [682.22, 366.12, 721.98, 432.88]\n\nFrame 5 (current):\n  Drone pose: [-55.24, 20.6, 20.62, -41.48, 87.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.65, \"ymin\": 342.55, \"xmax\": 607.7, \"ymax\": 409.46}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": -0.88, \"dyaw\": -4.24, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": -0.87, \"dyaw\": -4.18, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 1.54, \"dz\": -0.07, \"dpitch\": -0.86, \"dyaw\": -4.12, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.05, \"dz\": -0.09, \"dpitch\": -0.85, \"dyaw\": -4.05, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 2.56, \"dz\": -0.2, \"dpitch\": -0.71, \"dyaw\": -3.99, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.76, "window_alt_abs_m": 1.4, "target_px_mean_hist": 521.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.08, 23.67, 20.39, -42.16, 83.37, 0.0]\n  Target bbox: [619.69, 327.06, 659.99, 392.27]\n\nFrame 2:\n  Drone pose: [-55.06, 24.18, 20.36, -41.86, 83.83, 0.0]\n  Target bbox: [620.63, 331.24, 648.77, 397.19]\n\nFrame 3:\n  Drone pose: [-55.04, 24.69, 20.33, -41.87, 82.89, 0.0]\n  Target bbox: [629.07, 330.55, 665.69, 396.8]\n\nFrame 4:\n  Drone pose: [-55.13, 25.16, 20.42, -42.18, 83.23, 0.0]\n  Target bbox: [622.41, 326.55, 657.29, 392.75]\n\nFrame 5 (current):\n  Drone pose: [-54.94, 25.83, 20.37, -45.03, 78.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 682.08, \"ymin\": 282.9, \"xmax\": 722.6, \"ymax\": 350.71}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": 0.39, \"dz\": -0.13, \"dpitch\": 3.0, \"dyaw\": 4.82, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 0.91, \"dz\": -0.15, \"dpitch\": 3.03, \"dyaw\": 4.76, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 1.42, \"dz\": -0.18, \"dpitch\": 3.05, \"dyaw\": 4.66, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": 1.93, \"dz\": -0.2, \"dpitch\": 3.07, \"dyaw\": 4.52, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": 2.45, \"dz\": -0.22, \"dpitch\": 3.09, \"dyaw\": 4.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.29, "window_alt_abs_m": 0.2, "target_px_mean_hist": 526.0, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.4, 29.33, 20.12, -39.09, 84.61, 0.0]\n  Target bbox: [589.66, 373.42, 635.18, 441.73]\n\nFrame 2:\n  Drone pose: [-55.53, 29.87, 20.1, -38.09, 81.29, 0.0]\n  Target bbox: [629.09, 390.92, 671.17, 457.49]\n\nFrame 3:\n  Drone pose: [-55.72, 30.6, 20.12, -44.93, 81.35, 0.0]\n  Target bbox: [627.18, 281.5, 657.03, 347.17]\n\nFrame 4:\n  Drone pose: [-55.8, 30.94, 19.93, -42.85, 82.9, 0.0]\n  Target bbox: [598.69, 307.47, 641.71, 373.86]\n\nFrame 5 (current):\n  Drone pose: [-56.01, 31.51, 20.22, -42.19, 80.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.39, \"ymin\": 326.94, \"xmax\": 656.32, \"ymax\": 392.46}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.49, \"dz\": -0.16, \"dpitch\": 0.25, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 1.04, \"dz\": -0.17, \"dpitch\": 0.22, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": -0.12, \"dy\": 1.59, \"dz\": -0.18, \"dpitch\": 0.17, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": 2.14, \"dz\": -0.18, \"dpitch\": 0.11, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": 2.69, \"dz\": -0.19, \"dpitch\": 0.07, \"dyaw\": -0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.07, "window_alt_abs_m": 0.51, "target_px_mean_hist": 544.0, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.22, 34.69, 20.07, -43.4, 76.94, 0.0]\n  Target bbox: [664.41, 306.47, 695.0, 372.83]\n\nFrame 2:\n  Drone pose: [-56.18, 35.2, 20.09, -41.57, 79.54, 0.0]\n  Target bbox: [628.83, 334.19, 668.91, 406.62]\n\nFrame 3:\n  Drone pose: [-56.0, 35.87, 20.13, -42.42, 79.3, 0.0]\n  Target bbox: [618.08, 322.32, 662.0, 397.22]\n\nFrame 4:\n  Drone pose: [-55.87, 36.49, 20.12, -38.6, 73.22, 0.0]\n  Target bbox: [679.11, 389.25, 726.0, 463.89]\n\nFrame 5 (current):\n  Drone pose: [-55.69, 36.79, 19.97, -41.9, 77.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.11, \"ymin\": 323.88, \"xmax\": 661.0, \"ymax\": 395.71}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": 0.71, \"dz\": 0.04, \"dpitch\": -0.21, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 1.28, \"dz\": 0.04, \"dpitch\": -0.19, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 1.86, \"dz\": 0.04, \"dpitch\": -0.18, \"dyaw\": -2.94, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": 2.44, \"dz\": 0.04, \"dpitch\": -0.17, \"dyaw\": -3.83, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 3.01, \"dz\": 0.03, \"dpitch\": -0.13, \"dyaw\": -4.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.16, "window_alt_abs_m": 0.22, "target_px_mean_hist": 529.0, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.72, 40.91, 19.93, -41.92, 71.74, 0.0]\n  Target bbox: [622.26, 322.85, 658.01, 396.65]\n\nFrame 2:\n  Drone pose: [-54.44, 41.62, 19.99, -42.18, 70.98, 0.0]\n  Target bbox: [617.52, 324.55, 662.25, 394.96]\n\nFrame 3:\n  Drone pose: [-54.37, 42.03, 20.0, -41.78, 68.56, 0.0]\n  Target bbox: [653.13, 330.03, 695.04, 401.44]\n\nFrame 4:\n  Drone pose: [-54.26, 42.68, 19.86, -38.77, 65.59, 0.0]\n  Target bbox: [673.93, 376.05, 719.6, 451.53]\n\nFrame 5 (current):\n  Drone pose: [-54.07, 43.24, 20.08, -41.54, 73.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 591.65, \"ymin\": 341.14, \"xmax\": 624.79, \"ymax\": 408.28}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.39, \"dz\": -0.08, \"dpitch\": -0.6, \"dyaw\": -2.43, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 0.92, \"dz\": -0.08, \"dpitch\": -0.46, \"dyaw\": -3.44, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 1.44, \"dz\": -0.08, \"dpitch\": -0.55, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": 1.97, \"dz\": -0.08, \"dpitch\": -0.41, \"dyaw\": -4.08, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": 2.49, \"dz\": -0.08, \"dpitch\": -0.51, \"dyaw\": -3.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.62, "window_alt_abs_m": 0.42, "target_px_mean_hist": 538.5, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.12, 46.7, 19.86, -38.39, 74.16, 0.0]\n  Target bbox: [558.43, 381.07, 595.74, 455.24]\n\nFrame 2:\n  Drone pose: [-52.93, 47.36, 20.02, -42.05, 68.24, 0.0]\n  Target bbox: [614.54, 322.36, 665.25, 397.19]\n\nFrame 3:\n  Drone pose: [-52.93, 47.81, 20.0, -41.67, 65.07, 0.0]\n  Target bbox: [661.7, 333.51, 698.81, 397.45]\n\nFrame 4:\n  Drone pose: [-52.78, 48.23, 20.09, -40.93, 72.48, 0.0]\n  Target bbox: [555.64, 341.44, 598.2, 411.58]\n\nFrame 5 (current):\n  Drone pose: [-52.79, 48.81, 20.04, -44.96, 64.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 654.8, \"ymin\": 271.7, \"xmax\": 696.57, \"ymax\": 344.19}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.54, \"dz\": -0.04, \"dpitch\": 2.98, \"dyaw\": 3.52, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 1.06, \"dz\": -0.04, \"dpitch\": 3.13, \"dyaw\": 2.66, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 1.57, \"dz\": -0.04, \"dpitch\": 3.03, \"dyaw\": 3.08, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 2.08, \"dz\": -0.04, \"dpitch\": 3.17, \"dyaw\": 2.28, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 2.59, \"dz\": -0.04, \"dpitch\": 3.06, \"dyaw\": 2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.42, "window_alt_abs_m": 0.31, "target_px_mean_hist": 547.0, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-51.68, 51.81, 20.05, -41.66, 66.42, 0.0]\n  Target bbox: [618.69, 324.21, 661.13, 395.33]\n\nFrame 2:\n  Drone pose: [-51.22, 52.36, 19.91, -41.74, 67.49, 0.0]\n  Target bbox: [620.28, 325.12, 659.92, 394.41]\n\nFrame 3:\n  Drone pose: [-51.17, 52.92, 20.17, -41.98, 66.33, 0.0]\n  Target bbox: [618.03, 323.94, 661.79, 395.58]\n\nFrame 4:\n  Drone pose: [-50.93, 53.41, 20.0, -41.83, 66.93, 0.0]\n  Target bbox: [618.27, 322.38, 662.05, 397.27]\n\nFrame 5 (current):\n  Drone pose: [-50.66, 53.73, 19.96, -41.44, 66.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.49, \"ymin\": 325.29, \"xmax\": 658.31, \"ymax\": 394.24}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": 0.66, \"dz\": 0.04, \"dpitch\": -0.34, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": 1.14, \"dz\": 0.04, \"dpitch\": -0.44, \"dyaw\": 0.93, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": 1.62, \"dz\": 0.04, \"dpitch\": -0.3, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": 2.09, \"dz\": 0.04, \"dpitch\": -0.42, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": 2.57, \"dz\": 0.04, \"dpitch\": -0.29, \"dyaw\": 0.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.22, "window_alt_abs_m": 0.62, "target_px_mean_hist": 532.8, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.57, 57.15, 19.93, -41.45, 64.22, 0.0]\n  Target bbox: [670.1, 327.33, 707.3, 398.87]\n\nFrame 2:\n  Drone pose: [-48.4, 57.71, 20.0, -41.85, 68.46, 0.0]\n  Target bbox: [619.09, 321.06, 661.26, 398.53]\n\nFrame 3:\n  Drone pose: [-48.08, 58.17, 20.0, -41.03, 73.04, 0.0]\n  Target bbox: [557.76, 339.52, 596.02, 406.74]\n\nFrame 4:\n  Drone pose: [-47.77, 58.64, 20.0, -41.83, 68.84, 0.0]\n  Target bbox: [618.76, 323.17, 661.46, 396.36]\n\nFrame 5 (current):\n  Drone pose: [-47.52, 59.01, 19.95, -36.87, 68.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.15, \"ymin\": 399.43, \"xmax\": 659.85, \"ymax\": 474.85}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": 0.56, \"dz\": 0.05, \"dpitch\": -4.88, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": 1.02, \"dz\": 0.05, \"dpitch\": -4.92, \"dyaw\": 0.84, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": 1.48, \"dz\": 0.05, \"dpitch\": -4.95, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 1.95, \"dz\": 0.05, \"dpitch\": -4.97, \"dyaw\": 1.7, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": 2.42, \"dz\": 0.05, \"dpitch\": -4.99, \"dyaw\": 2.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.21, "window_alt_abs_m": 0.12, "target_px_mean_hist": 535.0, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.62, 62.16, 20.0, -38.1, 75.48, 0.0]\n  Target bbox: [554.8, 393.45, 599.29, 466.06]\n\nFrame 2:\n  Drone pose: [-46.53, 62.89, 19.87, -46.39, 65.51, 0.0]\n  Target bbox: [684.77, 258.09, 720.18, 327.44]\n\nFrame 3:\n  Drone pose: [-46.68, 63.76, 19.88, -42.72, 69.76, 0.0]\n  Target bbox: [615.85, 321.57, 663.89, 397.84]\n\nFrame 4:\n  Drone pose: [-46.78, 64.39, 20.02, -46.18, 64.38, 0.0]\n  Target bbox: [675.32, 270.23, 728.51, 348.19]\n\nFrame 5 (current):\n  Drone pose: [-46.64, 65.23, 19.87, -43.36, 69.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.46, \"ymin\": 324.88, \"xmax\": 657.31, \"ymax\": 394.5}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.6, \"dz\": 0.13, \"dpitch\": -0.36, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.34, \"dz\": 0.13, \"dpitch\": -0.68, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 2.07, \"dz\": 0.13, \"dpitch\": -1.0, \"dyaw\": -0.39, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.8, \"dz\": 0.13, \"dpitch\": -1.33, \"dyaw\": -0.62, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": 3.54, \"dz\": 0.13, \"dpitch\": -1.66, \"dyaw\": -0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.62, "window_alt_abs_m": 0.43, "target_px_mean_hist": 563.2, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.53, 70.37, 20.12, -44.85, 62.94, 0.0]\n  Target bbox: [676.98, 344.85, 720.21, 419.18]\n\nFrame 2:\n  Drone pose: [-46.59, 70.99, 20.04, -47.2, 72.6, 0.0]\n  Target bbox: [555.87, 305.47, 606.28, 380.01]\n\nFrame 3:\n  Drone pose: [-46.52, 71.71, 20.0, -45.27, 70.33, 0.0]\n  Target bbox: [586.94, 338.37, 628.49, 419.38]\n\nFrame 4:\n  Drone pose: [-46.51, 72.44, 20.0, -46.42, 65.85, 0.0]\n  Target bbox: [617.63, 321.13, 662.69, 398.15]\n\nFrame 5 (current):\n  Drone pose: [-46.5, 73.17, 20.0, -43.32, 65.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 595.32, \"ymin\": 370.41, \"xmax\": 641.39, \"ymax\": 453.45}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.74, \"dz\": 0.0, \"dpitch\": -3.08, \"dyaw\": -3.56, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 1.31, \"dz\": 0.0, \"dpitch\": -3.02, \"dyaw\": -4.27, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 1.88, \"dz\": 0.0, \"dpitch\": -2.73, \"dyaw\": -4.97, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 2.45, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -5.68, \"droll\": 0.0}, {\"dx\": 1.14, \"dy\": 3.02, \"dz\": 0.0, \"dpitch\": -3.26, \"dyaw\": -7.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.54, "window_alt_abs_m": 0.12, "target_px_mean_hist": 600.5, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776421777", "difficulty_score": 0.5311, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-82.25, -68.25, 22.0, -49.52, 47.39, 0.0]\n  Target bbox: [613.75, 319.36, 666.36, 399.95]\n\nFrame 2:\n  Drone pose: [-82.01, -68.01, 21.2, -48.36, 48.5, 0.0]\n  Target bbox: [617.77, 321.23, 662.15, 397.95]\n\nFrame 3:\n  Drone pose: [-81.78, -67.77, 20.67, -47.52, 49.6, 0.0]\n  Target bbox: [614.33, 318.64, 665.69, 400.64]\n\nFrame 4:\n  Drone pose: [-81.55, -67.53, 20.64, -47.19, 50.7, 0.0]\n  Target bbox: [623.39, 324.49, 656.93, 394.47]\n\nFrame 5 (current):\n  Drone pose: [-81.31, -67.29, 20.62, -46.57, 50.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.85, \"ymin\": 322.11, \"xmax\": 661.67, \"ymax\": 397.1}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": 0.24, \"dz\": -0.03, \"dpitch\": 0.61, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 0.48, \"dz\": -0.05, \"dpitch\": 1.2, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": 0.72, \"dz\": -0.07, \"dpitch\": 1.32, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": 0.93, \"dy\": 0.96, \"dz\": -0.09, \"dpitch\": 1.44, \"dyaw\": 1.82, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 1.19, \"dz\": -0.2, \"dpitch\": 1.7, \"dyaw\": 2.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.44, "window_alt_abs_m": 1.38, "target_px_mean_hist": 705.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-79.68, -65.62, 20.36, -44.14, 54.23, 0.0]\n  Target bbox: [622.98, 324.08, 657.36, 395.17]\n\nFrame 2:\n  Drone pose: [-79.45, -65.38, 20.33, -43.57, 54.04, 0.0]\n  Target bbox: [620.3, 322.53, 660.16, 396.89]\n\nFrame 3:\n  Drone pose: [-79.21, -65.14, 20.3, -43.01, 53.87, 0.0]\n  Target bbox: [620.79, 322.91, 659.66, 396.53]\n\nFrame 4:\n  Drone pose: [-78.98, -64.9, 20.27, -42.46, 53.7, 0.0]\n  Target bbox: [619.44, 323.32, 661.0, 396.09]\n\nFrame 5 (current):\n  Drone pose: [-78.47, -64.52, 20.24, -42.29, 53.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.96, \"ymin\": 324.31, \"xmax\": 657.38, \"ymax\": 395.04}, \"waypoint_deltas\": [{\"dx\": 0.38, \"dy\": 0.37, \"dz\": -0.02, \"dpitch\": 0.27, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 0.7, \"dz\": -0.05, \"dpitch\": 0.55, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": 0.93, \"dz\": -0.07, \"dpitch\": 0.61, \"dyaw\": 1.24, \"droll\": 0.0}, {\"dx\": 3.18, \"dy\": 1.08, \"dz\": -0.09, \"dpitch\": -0.09, \"dyaw\": 5.24, \"droll\": 0.0}, {\"dx\": 5.68, \"dy\": 1.25, \"dz\": -0.11, \"dpitch\": -1.26, \"dyaw\": 11.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.74, "window_alt_abs_m": 0.11, "target_px_mean_hist": 697.8, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.52, -61.47, 20.09, -46.18, 99.78, 0.0]\n  Target bbox: [620.35, 321.82, 660.06, 397.31]\n\nFrame 2:\n  Drone pose: [-58.2, -60.55, 20.08, -45.46, 110.13, 0.0]\n  Target bbox: [616.89, 320.0, 663.44, 399.26]\n\nFrame 3:\n  Drone pose: [-55.99, -59.81, 20.07, -44.45, 116.53, 0.0]\n  Target bbox: [615.83, 320.2, 664.38, 399.19]\n\nFrame 4:\n  Drone pose: [-54.81, -59.29, 20.06, -43.65, 119.61, 0.0]\n  Target bbox: [615.02, 320.24, 665.12, 399.27]\n\nFrame 5 (current):\n  Drone pose: [-54.3, -58.86, 20.05, -43.19, 120.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.16, \"ymin\": 319.76, \"xmax\": 665.98, \"ymax\": 399.84}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": 0.44, \"dz\": -0.01, \"dpitch\": 0.25, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 0.9, \"dz\": -0.01, \"dpitch\": 0.36, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 1.38, \"dz\": -0.02, \"dpitch\": 0.41, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 1.87, \"dz\": -0.02, \"dpitch\": 0.43, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 2.36, \"dz\": -0.03, \"dpitch\": 0.45, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.99, "window_alt_abs_m": 0.04, "target_px_mean_hist": 754.5, "cur_frame_id": 27, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.97, -54.99, 20.01, -42.73, 121.38, 0.0]\n  Target bbox: [615.27, 320.59, 664.85, 399.0]\n\nFrame 2:\n  Drone pose: [-53.96, -54.49, 20.01, -42.72, 121.4, 0.0]\n  Target bbox: [614.14, 319.99, 665.98, 399.65]\n\nFrame 3:\n  Drone pose: [-54.03, -54.06, 20.01, -42.68, 121.15, 0.0]\n  Target bbox: [615.91, 320.5, 664.25, 399.08]\n\nFrame 4:\n  Drone pose: [-54.09, -53.63, 20.01, -42.64, 120.89, 0.0]\n  Target bbox: [620.59, 322.02, 659.62, 397.39]\n\nFrame 5 (current):\n  Drone pose: [-54.15, -53.2, 20.01, -42.6, 120.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.97, \"ymin\": 320.6, \"xmax\": 664.2, \"ymax\": 398.98}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": 0.43, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 0.86, \"dz\": -0.01, \"dpitch\": 0.08, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.29, \"dz\": -0.01, \"dpitch\": 0.12, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.72, \"dz\": -0.01, \"dpitch\": 0.17, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": 2.15, \"dz\": -0.01, \"dpitch\": -0.13, \"dyaw\": -2.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.78, "window_alt_abs_m": 0.01, "target_px_mean_hist": 701.2, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.66, -49.76, 20.0, -43.51, 113.58, 0.0]\n  Target bbox: [616.61, 320.45, 663.05, 399.02]\n\nFrame 2:\n  Drone pose: [-54.73, -49.33, 20.0, -43.73, 112.0, 0.0]\n  Target bbox: [619.08, 321.62, 660.66, 397.73]\n\nFrame 3:\n  Drone pose: [-54.79, -48.9, 20.0, -43.93, 110.39, 0.0]\n  Target bbox: [614.93, 318.98, 664.7, 400.59]\n\nFrame 4:\n  Drone pose: [-54.85, -48.47, 20.0, -44.11, 108.77, 0.0]\n  Target bbox: [618.76, 321.68, 661.02, 397.66]\n\nFrame 5 (current):\n  Drone pose: [-54.92, -48.04, 20.0, -44.27, 107.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.66, \"ymin\": 321.14, \"xmax\": 664.69, \"ymax\": 398.23}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": -0.12, \"dy\": 0.87, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.3, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": 1.73, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -2.37, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 2.16, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -2.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 709.5, "cur_frame_id": 51, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00063/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.42, -44.59, 20.0, -43.88, 103.82, 0.0]\n  Target bbox: [615.46, 322.23, 664.9, 397.09]\n\nFrame 2:\n  Drone pose: [-55.49, -44.16, 20.0, -43.8, 103.59, 0.0]\n  Target bbox: [617.43, 322.25, 662.94, 397.07]\n\nFrame 3:\n  Drone pose: [-55.55, -43.73, 20.0, -43.73, 103.37, 0.0]\n  Target bbox: [615.69, 322.13, 664.68, 397.19]\n\nFrame 4:\n  Drone pose: [-55.61, -43.3, 20.0, -43.65, 103.14, 0.0]\n  Target bbox: [617.91, 322.37, 662.46, 396.95]\n\nFrame 5 (current):\n  Drone pose: [-55.68, -42.87, 20.0, -43.57, 102.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.99, \"ymin\": 322.52, \"xmax\": 664.38, \"ymax\": 396.8}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": -3.27, \"droll\": 0.0}, {\"dx\": -1.71, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": 0.69, \"dyaw\": -5.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.91, "window_alt_abs_m": 0.0, "target_px_mean_hist": 724.0, "cur_frame_id": 63, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.2, -40.41, 20.0, -42.69, 95.36, 0.0]\n  Target bbox: [619.73, 324.39, 660.7, 394.91]\n\nFrame 2:\n  Drone pose: [-58.25, -39.87, 20.0, -42.75, 95.21, 0.0]\n  Target bbox: [617.76, 324.65, 662.69, 394.63]\n\nFrame 3:\n  Drone pose: [-58.32, -39.33, 20.0, -42.81, 95.05, 0.0]\n  Target bbox: [618.65, 324.29, 661.81, 395.02]\n\nFrame 4:\n  Drone pose: [-58.38, -38.72, 20.0, -42.96, 94.91, 0.0]\n  Target bbox: [617.62, 324.44, 662.85, 394.83]\n\nFrame 5 (current):\n  Drone pose: [-58.39, -38.04, 20.0, -43.22, 94.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.15, \"ymin\": 324.2, \"xmax\": 661.31, \"ymax\": 395.07}, \"waypoint_deltas\": [{\"dx\": -0.42, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.18, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": 1.09, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": -1.91, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 1.4, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -3.15, \"droll\": 0.0}, {\"dx\": -1.26, \"dy\": 1.87, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -3.6, \"droll\": 0.0}, {\"dx\": -1.37, \"dy\": 2.33, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -3.92, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.45, "window_alt_abs_m": 0.0, "target_px_mean_hist": 713.0, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-59.92, -34.55, 20.0, -42.6, 90.54, 0.0]\n  Target bbox: [622.62, 324.8, 657.68, 394.45]\n\nFrame 2:\n  Drone pose: [-59.93, -34.21, 20.0, -42.39, 90.5, 0.0]\n  Target bbox: [624.64, 325.27, 655.64, 394.03]\n\nFrame 3:\n  Drone pose: [-59.92, -33.82, 20.0, -42.24, 90.52, 0.0]\n  Target bbox: [623.43, 325.19, 656.85, 394.06]\n\nFrame 4:\n  Drone pose: [-59.9, -33.39, 20.0, -42.14, 90.57, 0.0]\n  Target bbox: [619.18, 326.13, 661.12, 393.18]\n\nFrame 5 (current):\n  Drone pose: [-59.86, -32.96, 20.0, -42.05, 90.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.01, \"ymin\": 325.42, \"xmax\": 655.36, \"ymax\": 393.9}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 0.88, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.34, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 2.3, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 709.2, "cur_frame_id": 86, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00098/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-57.97, -29.21, 20.0, -41.7, 89.02, 0.0]\n  Target bbox: [616.28, 320.61, 663.76, 399.07]\n\nFrame 2:\n  Drone pose: [-57.66, -28.74, 20.0, -41.66, 88.51, 0.0]\n  Target bbox: [614.3, 319.85, 665.72, 399.99]\n\nFrame 3:\n  Drone pose: [-57.36, -28.27, 20.0, -41.61, 87.96, 0.0]\n  Target bbox: [615.53, 320.3, 664.52, 399.45]\n\nFrame 4:\n  Drone pose: [-57.09, -27.79, 20.0, -41.79, 87.34, 0.0]\n  Target bbox: [614.39, 320.07, 665.64, 399.76]\n\nFrame 5 (current):\n  Drone pose: [-56.85, -27.31, 20.0, -41.76, 86.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.15, \"ymin\": 323.04, \"xmax\": 659.88, \"ymax\": 396.44}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.52, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": 1.53, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -2.4, \"droll\": 0.0}, {\"dx\": 0.75, \"dy\": 2.05, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -3.37, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 2.56, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -4.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.35, "window_alt_abs_m": 0.0, "target_px_mean_hist": 700.5, "cur_frame_id": 98, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/ORI/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.88, -23.22, 20.0, -41.68, 82.55, 0.0]\n  Target bbox: [617.48, 325.17, 662.1, 394.25]\n\nFrame 2:\n  Drone pose: [-55.87, -22.72, 20.0, -41.69, 82.57, 0.0]\n  Target bbox: [620.55, 324.64, 659.06, 394.74]\n\nFrame 3:\n  Drone pose: [-55.86, -22.21, 20.0, -41.69, 82.58, 0.0]\n  Target bbox: [620.95, 324.7, 658.66, 394.71]\n\nFrame 4:\n  Drone pose: [-55.86, -21.71, 20.0, -41.69, 82.6, 0.0]\n  Target bbox: [618.77, 324.63, 660.8, 394.8]\n\nFrame 5 (current):\n  Drone pose: [-55.84, -21.22, 20.0, -41.69, 82.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.1, \"ymin\": 324.89, \"xmax\": 661.48, \"ymax\": 394.54}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 687.0, "cur_frame_id": 110, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-82.15, -68.13, 22.05, -48.98, 49.94, 0.0]\n  Target bbox: [586.07, 334.78, 636.49, 413.82]\n\nFrame 2:\n  Drone pose: [-82.01, -68.01, 21.2, -48.36, 48.5, 0.0]\n  Target bbox: [613.63, 318.57, 666.42, 400.74]\n\nFrame 3:\n  Drone pose: [-81.64, -67.71, 20.63, -47.69, 49.81, 0.0]\n  Target bbox: [613.94, 318.4, 666.08, 400.89]\n\nFrame 4:\n  Drone pose: [-81.55, -67.53, 20.64, -46.54, 45.7, 0.0]\n  Target bbox: [677.72, 335.15, 717.78, 409.37]\n\nFrame 5 (current):\n  Drone pose: [-81.31, -67.29, 20.62, -46.57, 50.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.8, \"ymin\": 323.51, \"xmax\": 657.57, \"ymax\": 395.53}, \"waypoint_deltas\": [{\"dx\": 0.23, \"dy\": 0.24, \"dz\": -0.03, \"dpitch\": 0.61, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 0.48, \"dz\": -0.05, \"dpitch\": 1.2, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": 0.72, \"dz\": -0.07, \"dpitch\": 1.32, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": 0.93, \"dy\": 0.96, \"dz\": -0.09, \"dpitch\": 1.44, \"dyaw\": 1.82, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 1.19, \"dz\": -0.2, \"dpitch\": 1.7, \"dyaw\": 2.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.73, "window_alt_abs_m": 1.46, "target_px_mean_hist": 699.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-79.6, -65.68, 20.46, -43.82, 59.51, 0.0]\n  Target bbox: [556.24, 330.89, 603.03, 407.64]\n\nFrame 2:\n  Drone pose: [-79.27, -65.37, 20.29, -45.18, 50.35, 0.0]\n  Target bbox: [671.87, 299.99, 708.29, 371.07]\n\nFrame 3:\n  Drone pose: [-79.12, -65.07, 20.29, -43.14, 53.97, 0.0]\n  Target bbox: [621.29, 324.2, 659.07, 395.12]\n\nFrame 4:\n  Drone pose: [-78.98, -64.9, 20.27, -42.46, 53.7, 0.0]\n  Target bbox: [619.47, 323.03, 661.02, 396.5]\n\nFrame 5 (current):\n  Drone pose: [-78.43, -64.58, 20.17, -38.71, 50.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 662.25, \"ymin\": 382.09, \"xmax\": 701.58, \"ymax\": 454.58}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": 0.43, \"dz\": 0.05, \"dpitch\": -3.31, \"dyaw\": 3.09, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 0.76, \"dz\": 0.02, \"dpitch\": -3.03, \"dyaw\": 3.23, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 0.99, \"dz\": 0.0, \"dpitch\": -2.97, \"dyaw\": 4.37, \"droll\": 0.0}, {\"dx\": 3.14, \"dy\": 1.14, \"dz\": -0.02, \"dpitch\": -3.67, \"dyaw\": 8.37, \"droll\": 0.0}, {\"dx\": 5.64, \"dy\": 1.31, \"dz\": -0.04, \"dpitch\": -4.84, \"dyaw\": 15.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.96, "window_alt_abs_m": 0.28, "target_px_mean_hist": 710.5, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00027/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-61.52, -61.47, 20.09, -46.18, 99.78, 0.0]\n  Target bbox: [621.97, 321.79, 658.4, 397.26]\n\nFrame 2:\n  Drone pose: [-58.05, -60.5, 20.18, -45.6, 110.6, 0.0]\n  Target bbox: [613.63, 320.26, 666.64, 399.05]\n\nFrame 3:\n  Drone pose: [-56.05, -59.79, 20.24, -44.77, 116.41, 0.0]\n  Target bbox: [619.81, 321.33, 660.44, 397.96]\n\nFrame 4:\n  Drone pose: [-54.96, -59.27, 20.12, -44.86, 114.25, 0.0]\n  Target bbox: [677.62, 306.13, 724.69, 383.69]\n\nFrame 5 (current):\n  Drone pose: [-54.3, -58.86, 20.05, -45.43, 121.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 608.59, \"ymin\": 282.02, \"xmax\": 660.58, \"ymax\": 362.19}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": 0.44, \"dz\": -0.01, \"dpitch\": 2.49, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 0.9, \"dz\": -0.01, \"dpitch\": 2.6, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 1.38, \"dz\": -0.02, \"dpitch\": 2.65, \"dyaw\": 0.14, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 1.87, \"dz\": -0.02, \"dpitch\": 2.67, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": 2.36, \"dz\": -0.03, \"dpitch\": 2.69, \"dyaw\": 0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.74, "window_alt_abs_m": 0.34, "target_px_mean_hist": 744.5, "cur_frame_id": 27, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.84, -54.88, 20.07, -42.85, 121.87, 0.0]\n  Target bbox: [616.81, 320.6, 663.36, 398.93]\n\nFrame 2:\n  Drone pose: [-54.09, -54.47, 19.89, -42.65, 121.13, 0.0]\n  Target bbox: [614.44, 319.94, 665.69, 399.67]\n\nFrame 3:\n  Drone pose: [-54.09, -54.11, 20.06, -41.27, 125.92, 0.0]\n  Target bbox: [550.78, 346.24, 604.83, 426.58]\n\nFrame 4:\n  Drone pose: [-54.02, -53.63, 19.89, -42.41, 121.07, 0.0]\n  Target bbox: [617.66, 321.13, 662.53, 398.42]\n\nFrame 5 (current):\n  Drone pose: [-54.15, -53.2, 20.01, -42.6, 120.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.36, \"ymin\": 321.63, \"xmax\": 660.84, \"ymax\": 397.83}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": 0.43, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 0.86, \"dz\": -0.01, \"dpitch\": 0.08, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.29, \"dz\": -0.01, \"dpitch\": 0.12, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 1.72, \"dz\": -0.01, \"dpitch\": 0.17, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": 2.15, \"dz\": -0.01, \"dpitch\": -0.13, \"dyaw\": -2.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.8, "window_alt_abs_m": 0.64, "target_px_mean_hist": 720.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.82, -49.69, 19.99, -43.68, 113.24, 0.0]\n  Target bbox: [617.27, 320.75, 662.41, 398.69]\n\nFrame 2:\n  Drone pose: [-54.69, -49.17, 19.91, -43.79, 112.27, 0.0]\n  Target bbox: [617.91, 320.9, 661.82, 398.44]\n\nFrame 3:\n  Drone pose: [-54.68, -48.84, 19.85, -43.73, 110.75, 0.0]\n  Target bbox: [617.15, 319.69, 662.51, 399.77]\n\nFrame 4:\n  Drone pose: [-54.78, -48.47, 19.84, -48.83, 108.69, 0.0]\n  Target bbox: [618.6, 235.13, 667.9, 316.12]\n\nFrame 5 (current):\n  Drone pose: [-55.01, -47.9, 19.91, -44.37, 106.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.82, \"ymin\": 321.08, \"xmax\": 664.51, \"ymax\": 398.22}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.29, \"dz\": 0.09, \"dpitch\": 0.17, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": 0.73, \"dz\": 0.09, \"dpitch\": 0.24, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": 1.16, \"dz\": 0.09, \"dpitch\": 0.31, \"dyaw\": -0.58, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": 1.59, \"dz\": 0.09, \"dpitch\": 0.19, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": -0.22, \"dy\": 2.02, \"dz\": 0.09, \"dpitch\": 0.26, \"dyaw\": -2.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.26, "window_alt_abs_m": 0.22, "target_px_mean_hist": 705.2, "cur_frame_id": 51, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00063/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.35, -44.51, 20.13, -41.91, 99.11, 0.0]\n  Target bbox: [676.67, 362.75, 725.26, 436.07]\n\nFrame 2:\n  Drone pose: [-55.49, -44.16, 20.0, -38.8, 101.82, 0.0]\n  Target bbox: [637.56, 406.56, 686.99, 481.45]\n\nFrame 3:\n  Drone pose: [-55.55, -43.73, 20.0, -46.12, 105.18, 0.0]\n  Target bbox: [596.84, 281.9, 638.44, 357.51]\n\nFrame 4:\n  Drone pose: [-55.55, -43.34, 19.85, -41.25, 98.29, 0.0]\n  Target bbox: [682.3, 360.21, 721.02, 433.45]\n\nFrame 5 (current):\n  Drone pose: [-55.68, -42.87, 20.0, -43.57, 102.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.8, \"ymin\": 322.38, \"xmax\": 659.56, \"ymax\": 396.89}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": -3.27, \"droll\": 0.0}, {\"dx\": -1.71, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": 0.69, \"dyaw\": -5.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.59, "window_alt_abs_m": 0.43, "target_px_mean_hist": 735.0, "cur_frame_id": 63, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.2, -40.41, 20.0, -39.76, 95.86, 0.0]\n  Target bbox: [617.39, 373.42, 650.19, 444.37]\n\nFrame 2:\n  Drone pose: [-58.25, -39.87, 20.0, -42.75, 95.21, 0.0]\n  Target bbox: [621.35, 324.09, 659.05, 395.18]\n\nFrame 3:\n  Drone pose: [-58.26, -39.31, 19.86, -42.06, 91.6, 0.0]\n  Target bbox: [662.59, 335.47, 708.42, 404.34]\n\nFrame 4:\n  Drone pose: [-58.36, -38.7, 19.96, -42.93, 94.96, 0.0]\n  Target bbox: [620.69, 323.9, 659.75, 395.4]\n\nFrame 5 (current):\n  Drone pose: [-58.43, -38.11, 20.1, -42.3, 96.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 601.63, \"ymin\": 340.65, \"xmax\": 644.41, \"ymax\": 411.5}, \"waypoint_deltas\": [{\"dx\": -0.38, \"dy\": 0.6, \"dz\": -0.1, \"dpitch\": -1.01, \"dyaw\": -2.43, \"droll\": 0.0}, {\"dx\": -0.64, \"dy\": 1.16, \"dz\": -0.1, \"dpitch\": -1.12, \"dyaw\": -3.16, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": 1.47, \"dz\": -0.1, \"dpitch\": -0.88, \"dyaw\": -4.4, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": 1.94, \"dz\": -0.1, \"dpitch\": -0.84, \"dyaw\": -4.85, \"droll\": 0.0}, {\"dx\": -1.33, \"dy\": 2.4, \"dz\": -0.1, \"dpitch\": -0.78, \"dyaw\": -5.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.83, "window_alt_abs_m": 0.38, "target_px_mean_hist": 724.0, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-59.92, -34.55, 20.0, -43.19, 89.63, 0.0]\n  Target bbox: [632.25, 314.92, 670.65, 384.67]\n\nFrame 2:\n  Drone pose: [-59.9, -34.18, 20.01, -42.44, 90.57, 0.0]\n  Target bbox: [622.19, 325.1, 658.13, 394.18]\n\nFrame 3:\n  Drone pose: [-59.92, -33.82, 20.0, -44.73, 88.18, 0.0]\n  Target bbox: [650.93, 283.3, 687.67, 353.24]\n\nFrame 4:\n  Drone pose: [-59.79, -33.38, 19.97, -45.02, 94.53, 0.0]\n  Target bbox: [571.95, 277.11, 616.78, 346.54]\n\nFrame 5 (current):\n  Drone pose: [-59.86, -32.96, 20.0, -39.35, 90.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.12, \"ymin\": 370.65, \"xmax\": 659.56, \"ymax\": 439.16}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": -2.61, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 0.88, \"dz\": 0.0, \"dpitch\": -2.53, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.34, \"dz\": 0.0, \"dpitch\": -2.46, \"dyaw\": 1.4, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": -2.44, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 2.3, \"dz\": 0.0, \"dpitch\": -2.42, \"dyaw\": 0.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.81, "window_alt_abs_m": 0.07, "target_px_mean_hist": 723.2, "cur_frame_id": 86, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00098/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.03, -29.26, 20.09, -44.0, 89.44, 0.0]\n  Target bbox: [610.14, 284.14, 654.75, 360.46]\n\nFrame 2:\n  Drone pose: [-57.66, -28.74, 20.0, -41.05, 85.63, 0.0]\n  Target bbox: [653.69, 331.78, 699.57, 409.26]\n\nFrame 3:\n  Drone pose: [-57.36, -28.27, 20.0, -42.38, 85.27, 0.0]\n  Target bbox: [652.89, 310.18, 695.46, 384.67]\n\nFrame 4:\n  Drone pose: [-57.11, -27.86, 20.11, -41.68, 92.31, 0.0]\n  Target bbox: [550.45, 325.04, 603.49, 404.55]\n\nFrame 5 (current):\n  Drone pose: [-56.87, -27.28, 19.93, -41.69, 86.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.85, \"ymin\": 320.62, \"xmax\": 664.22, \"ymax\": 399.12}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": 0.47, \"dz\": 0.07, \"dpitch\": -0.04, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": 0.98, \"dz\": 0.07, \"dpitch\": -0.02, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 1.5, \"dz\": 0.07, \"dpitch\": 0.0, \"dyaw\": -2.34, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": 2.02, \"dz\": 0.07, \"dpitch\": 0.02, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": 2.53, \"dz\": 0.07, \"dpitch\": 0.07, \"dyaw\": -4.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.91, "window_alt_abs_m": 0.38, "target_px_mean_hist": 660.2, "cur_frame_id": 98, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060/aug_001/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.75, -23.18, 19.99, -41.63, 84.07, 0.0]\n  Target bbox: [607.47, 326.86, 641.38, 396.58]\n\nFrame 2:\n  Drone pose: [-55.87, -22.72, 20.0, -46.69, 80.9, 0.0]\n  Target bbox: [639.15, 240.47, 683.47, 311.19]\n\nFrame 3:\n  Drone pose: [-55.98, -22.18, 19.87, -41.31, 80.0, 0.0]\n  Target bbox: [647.24, 328.11, 690.07, 398.88]\n\nFrame 4:\n  Drone pose: [-55.86, -21.71, 20.0, -41.56, 77.77, 0.0]\n  Target bbox: [677.11, 328.01, 724.57, 399.3]\n\nFrame 5 (current):\n  Drone pose: [-55.84, -21.22, 20.0, -39.03, 86.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 576.67, \"ymin\": 370.99, \"xmax\": 611.07, \"ymax\": 439.76}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -3.6, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -3.54, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -2.67, \"dyaw\": -3.47, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": -2.67, \"dyaw\": -3.4, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": -2.67, \"dyaw\": -3.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.8, "window_alt_abs_m": 0.27, "target_px_mean_hist": 694.0, "cur_frame_id": 110, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776066060", "difficulty_score": 0.2889, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.46, 62.27, 22.0, -46.47, -97.59, 0.0]\n  Target bbox: [619.04, 328.39, 660.68, 390.84]\n\nFrame 2:\n  Drone pose: [-53.39, 61.0, 21.2, -47.45, -93.77, 0.0]\n  Target bbox: [619.35, 328.54, 660.35, 390.5]\n\nFrame 3:\n  Drone pose: [-53.48, 60.41, 20.67, -47.65, -92.0, 0.0]\n  Target bbox: [617.8, 327.46, 661.82, 391.54]\n\nFrame 4:\n  Drone pose: [-53.54, 59.82, 20.64, -48.58, -90.21, 0.0]\n  Target bbox: [622.86, 328.19, 657.04, 390.77]\n\nFrame 5 (current):\n  Drone pose: [-53.55, 59.29, 20.62, -49.42, -88.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.65, \"ymin\": 320.39, \"xmax\": 663.33, \"ymax\": 398.73}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.44, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": 1.01, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": -0.93, \"dz\": -0.05, \"dpitch\": 0.19, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": -1.5, \"dz\": -0.07, \"dpitch\": 0.11, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": -2.14, \"dz\": -0.09, \"dpitch\": -0.11, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": -2.88, \"dz\": -0.2, \"dpitch\": -0.33, \"dyaw\": -3.8, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.09, "window_alt_abs_m": 1.38, "target_px_mean_hist": 525.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.49, 50.7, 20.24, -52.41, -112.22, 0.0]\n  Target bbox: [618.07, 323.09, 661.78, 395.65]\n\nFrame 2:\n  Drone pose: [-49.01, 49.75, 20.22, -52.79, -114.64, 0.0]\n  Target bbox: [618.69, 322.99, 661.18, 395.72]\n\nFrame 3:\n  Drone pose: [-48.61, 48.86, 20.19, -53.07, -116.71, 0.0]\n  Target bbox: [620.36, 323.31, 659.51, 395.38]\n\nFrame 4:\n  Drone pose: [-48.31, 48.06, 20.17, -53.29, -118.32, 0.0]\n  Target bbox: [623.56, 323.87, 656.67, 394.82]\n\nFrame 5 (current):\n  Drone pose: [-48.11, 47.34, 20.15, -52.96, -121.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.3, \"ymin\": 323.46, \"xmax\": 658.58, \"ymax\": 395.25}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": -0.64, \"dz\": -0.01, \"dpitch\": -0.37, \"dyaw\": -3.47, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.23, \"dz\": -0.03, \"dpitch\": -0.63, \"dyaw\": -6.66, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.78, \"dz\": -0.05, \"dpitch\": -0.81, \"dyaw\": -9.66, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -2.3, \"dz\": -0.06, \"dpitch\": -0.93, \"dyaw\": -12.53, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -2.81, \"dz\": -0.07, \"dpitch\": -0.99, \"dyaw\": -15.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.95, "window_alt_abs_m": 0.09, "target_px_mean_hist": 656.5, "cur_frame_id": 19, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.56, 41.37, 20.03, -53.32, -152.87, 0.0]\n  Target bbox: [619.85, 321.29, 660.36, 397.39]\n\nFrame 2:\n  Drone pose: [-48.75, 40.88, 20.03, -53.2, -155.23, 0.0]\n  Target bbox: [617.17, 316.71, 663.22, 402.03]\n\nFrame 3:\n  Drone pose: [-48.98, 40.41, 20.02, -53.08, -157.4, 0.0]\n  Target bbox: [621.67, 323.57, 658.51, 395.09]\n\nFrame 4:\n  Drone pose: [-49.24, 39.98, 20.02, -52.95, -159.37, 0.0]\n  Target bbox: [619.6, 318.33, 660.79, 400.46]\n\nFrame 5 (current):\n  Drone pose: [-49.54, 39.58, 20.02, -52.83, -161.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.29, \"ymin\": 315.9, \"xmax\": 662.21, \"ymax\": 402.91}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": -0.36, \"dz\": -0.01, \"dpitch\": 0.12, \"dyaw\": -1.6, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -0.69, \"dz\": -0.01, \"dpitch\": 0.24, \"dyaw\": -3.04, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -1.0, \"dz\": -0.01, \"dpitch\": 0.36, \"dyaw\": -4.35, \"droll\": 0.0}, {\"dx\": -1.37, \"dy\": -1.28, \"dz\": -0.01, \"dpitch\": 0.48, \"dyaw\": -5.55, \"droll\": 0.0}, {\"dx\": -1.75, \"dy\": -1.54, \"dz\": -0.01, \"dpitch\": 0.6, \"dyaw\": -6.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.28, "window_alt_abs_m": 0.02, "target_px_mean_hist": 687.5, "cur_frame_id": 34, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.66, 36.7, 20.0, -51.48, -173.31, 0.0]\n  Target bbox: [624.42, 319.34, 655.99, 399.47]\n\nFrame 2:\n  Drone pose: [-54.04, 36.5, 20.0, -51.29, -174.11, 0.0]\n  Target bbox: [625.21, 326.1, 655.03, 392.66]\n\nFrame 3:\n  Drone pose: [-54.41, 36.31, 20.0, -51.09, -174.88, 0.0]\n  Target bbox: [625.05, 320.26, 655.34, 398.58]\n\nFrame 4:\n  Drone pose: [-54.77, 36.12, 20.0, -50.86, -175.63, 0.0]\n  Target bbox: [625.09, 319.28, 655.33, 399.59]\n\nFrame 5 (current):\n  Drone pose: [-55.13, 35.93, 20.0, -50.63, -176.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.39, \"ymin\": 318.41, \"xmax\": 656.1, \"ymax\": 400.6}, \"waypoint_deltas\": [{\"dx\": -0.35, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": -0.36, \"dz\": 0.0, \"dpitch\": 0.52, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": 0.76, \"dyaw\": -2.01, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": 1.03, \"dyaw\": -2.59, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.82, \"dz\": 0.0, \"dpitch\": 1.26, \"dyaw\": -3.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.04, "window_alt_abs_m": 0.0, "target_px_mean_hist": 662.5, "cur_frame_id": 49, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.75, 34.23, 20.0, -47.42, 177.63, 0.0]\n  Target bbox: [626.27, 320.89, 653.34, 398.23]\n\nFrame 2:\n  Drone pose: [-58.95, 34.0, 20.0, -46.9, 176.93, 0.0]\n  Target bbox: [626.69, 325.8, 653.05, 393.22]\n\nFrame 3:\n  Drone pose: [-59.23, 33.67, 20.0, -46.51, 175.93, 0.0]\n  Target bbox: [625.45, 322.15, 654.16, 397.11]\n\nFrame 4:\n  Drone pose: [-59.67, 33.22, 20.0, -46.37, 174.49, 0.0]\n  Target bbox: [626.2, 327.43, 653.58, 391.63]\n\nFrame 5 (current):\n  Drone pose: [-60.32, 32.66, 20.0, -46.51, 172.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.12, \"ymin\": 325.2, \"xmax\": 654.6, \"ymax\": 393.9}, \"waypoint_deltas\": [{\"dx\": -0.79, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": -0.33, \"dyaw\": -1.95, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": -0.71, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -2.41, \"dy\": -1.44, \"dz\": 0.0, \"dpitch\": -1.06, \"dyaw\": -5.11, \"droll\": 0.0}, {\"dx\": -3.13, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -1.31, \"dyaw\": -6.04, \"droll\": 0.0}, {\"dx\": -3.76, \"dy\": -1.81, \"dz\": 0.0, \"dpitch\": -1.48, \"dyaw\": -6.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.95, "window_alt_abs_m": 0.0, "target_px_mean_hist": 613.8, "cur_frame_id": 64, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-67.57, 30.81, 20.0, -47.95, 165.98, 0.0]\n  Target bbox: [624.51, 326.13, 655.29, 392.86]\n\nFrame 2:\n  Drone pose: [-68.06, 30.79, 20.0, -47.92, 165.9, 0.0]\n  Target bbox: [623.04, 323.19, 656.65, 395.91]\n\nFrame 3:\n  Drone pose: [-68.55, 30.77, 20.0, -47.9, 165.84, 0.0]\n  Target bbox: [622.17, 319.59, 657.41, 399.57]\n\nFrame 4:\n  Drone pose: [-69.03, 30.77, 20.0, -47.87, 165.86, 0.0]\n  Target bbox: [622.14, 319.42, 657.42, 399.76]\n\nFrame 5 (current):\n  Drone pose: [-69.51, 30.79, 20.0, -47.85, 165.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.7, \"ymin\": 318.64, \"xmax\": 658.82, \"ymax\": 400.58}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.23, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": 0.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.25, "window_alt_abs_m": 0.0, "target_px_mean_hist": 635.2, "cur_frame_id": 80, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-75.87, 30.91, 20.0, -47.66, 166.46, 0.0]\n  Target bbox: [622.33, 319.47, 657.23, 399.72]\n\nFrame 2:\n  Drone pose: [-76.37, 30.88, 20.0, -47.67, 166.36, 0.0]\n  Target bbox: [621.04, 318.73, 658.45, 400.63]\n\nFrame 3:\n  Drone pose: [-76.9, 30.8, 20.0, -47.67, 166.09, 0.0]\n  Target bbox: [620.86, 318.57, 658.63, 400.75]\n\nFrame 4:\n  Drone pose: [-77.44, 30.67, 20.0, -47.68, 165.62, 0.0]\n  Target bbox: [623.32, 323.98, 656.4, 395.11]\n\nFrame 5 (current):\n  Drone pose: [-77.99, 30.49, 20.0, -47.69, 164.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.95, \"ymin\": 323.33, \"xmax\": 656.75, \"ymax\": 395.79}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.55, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -0.68, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.38, \"droll\": 0.0}, {\"dx\": -2.33, \"dy\": -0.9, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -3.18, \"droll\": 0.0}, {\"dx\": -2.93, \"dy\": -1.09, \"dz\": 0.0, \"dpitch\": -0.39, \"dyaw\": -3.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 627.8, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00112/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.66, 29.2, 20.0, -47.79, 153.38, 0.0]\n  Target bbox: [619.94, 323.03, 660.3, 395.99]\n\nFrame 2:\n  Drone pose: [-85.29, 29.3, 20.0, -47.69, 152.0, 0.0]\n  Target bbox: [620.34, 322.5, 659.98, 396.67]\n\nFrame 3:\n  Drone pose: [-85.93, 29.42, 20.0, -47.59, 150.67, 0.0]\n  Target bbox: [620.03, 322.44, 660.32, 396.74]\n\nFrame 4:\n  Drone pose: [-86.57, 29.55, 20.0, -47.48, 149.4, 0.0]\n  Target bbox: [620.2, 322.49, 660.13, 396.61]\n\nFrame 5 (current):\n  Drone pose: [-87.21, 29.7, 20.0, -47.37, 148.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.45, \"ymin\": 324.65, \"xmax\": 656.82, \"ymax\": 394.42}, \"waypoint_deltas\": [{\"dx\": -0.62, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.16, \"droll\": 0.0}, {\"dx\": -1.24, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": -2.26, \"droll\": 0.0}, {\"dx\": -1.86, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.4, \"dyaw\": -3.36, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": -4.49, \"droll\": 0.0}, {\"dx\": -3.2, \"dy\": 0.85, \"dz\": 0.0, \"dpitch\": 0.58, \"dyaw\": -5.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.2, "window_alt_abs_m": 0.0, "target_px_mean_hist": 614.0, "cur_frame_id": 112, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.55, 32.47, 20.0, -47.09, 137.48, 0.0]\n  Target bbox: [622.46, 324.41, 657.96, 394.73]\n\nFrame 2:\n  Drone pose: [-95.14, 32.88, 20.0, -47.1, 137.07, 0.0]\n  Target bbox: [623.91, 324.34, 656.46, 394.68]\n\nFrame 3:\n  Drone pose: [-95.71, 33.28, 20.0, -47.06, 136.68, 0.0]\n  Target bbox: [623.28, 325.41, 657.07, 393.58]\n\nFrame 4:\n  Drone pose: [-96.25, 33.66, 20.0, -46.98, 136.31, 0.0]\n  Target bbox: [623.72, 325.73, 656.64, 393.33]\n\nFrame 5 (current):\n  Drone pose: [-96.78, 34.03, 20.0, -46.86, 135.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.95, \"ymin\": 324.39, \"xmax\": 657.47, \"ymax\": 394.73}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.72, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": 0.41, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": -1.4, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": 0.64, \"dyaw\": -1.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 615.8, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/ORI/frames_playback/frame_00142/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.26, 38.94, 20.0, -45.08, 136.82, 0.0]\n  Target bbox: [626.34, 325.74, 653.97, 393.45]\n\nFrame 2:\n  Drone pose: [-101.57, 39.46, 20.0, -44.89, 137.24, 0.0]\n  Target bbox: [624.01, 327.98, 656.27, 391.14]\n\nFrame 3:\n  Drone pose: [-101.88, 39.97, 20.0, -44.7, 137.65, 0.0]\n  Target bbox: [626.33, 328.5, 653.91, 390.65]\n\nFrame 4:\n  Drone pose: [-102.19, 40.49, 20.0, -44.51, 138.05, 0.0]\n  Target bbox: [625.13, 326.75, 655.18, 392.47]\n\nFrame 5 (current):\n  Drone pose: [-102.51, 41.0, 20.0, -44.32, 138.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.19, \"ymin\": 328.41, \"xmax\": 656.06, \"ymax\": 390.74}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -0.93, \"dy\": 1.55, \"dz\": 0.0, \"dpitch\": 0.8, \"dyaw\": 1.18, \"droll\": 0.0}, {\"dx\": -1.24, \"dy\": 2.06, \"dz\": 0.0, \"dpitch\": 0.54, \"dyaw\": 2.67, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 2.58, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 4.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.64, "window_alt_abs_m": 0.0, "target_px_mean_hist": 579.0, "cur_frame_id": 142, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.49, 62.23, 22.03, -49.3, -99.25, 0.0]\n  Target bbox: [617.83, 325.23, 662.04, 393.92]\n\nFrame 2:\n  Drone pose: [-53.35, 60.97, 21.29, -43.02, -91.04, 0.0]\n  Target bbox: [612.2, 333.04, 637.85, 392.62]\n\nFrame 3:\n  Drone pose: [-53.29, 60.41, 20.62, -43.74, -92.85, 0.0]\n  Target bbox: [623.18, 328.59, 657.02, 390.73]\n\nFrame 4:\n  Drone pose: [-53.54, 59.82, 20.64, -48.58, -90.21, 0.0]\n  Target bbox: [619.14, 327.5, 660.76, 391.41]\n\nFrame 5 (current):\n  Drone pose: [-53.55, 59.29, 20.62, -48.48, -84.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.04, \"ymin\": 341.63, \"xmax\": 609.91, \"ymax\": 411.64}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.44, \"dz\": -0.03, \"dpitch\": -0.79, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": -0.93, \"dz\": -0.05, \"dpitch\": -0.75, \"dyaw\": -4.13, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": -1.5, \"dz\": -0.07, \"dpitch\": -0.83, \"dyaw\": -4.81, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": -2.14, \"dz\": -0.09, \"dpitch\": -1.05, \"dyaw\": -5.57, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": -2.88, \"dz\": -0.2, \"dpitch\": -1.27, \"dyaw\": -8.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.81, "window_alt_abs_m": 1.47, "target_px_mean_hist": 520.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.49, 50.7, 20.24, -47.59, -117.22, 0.0]\n  Target bbox: [665.13, 402.88, 718.9, 481.47]\n\nFrame 2:\n  Drone pose: [-49.01, 49.75, 20.22, -49.17, -114.73, 0.0]\n  Target bbox: [620.3, 386.65, 661.27, 453.52]\n\nFrame 3:\n  Drone pose: [-48.48, 48.91, 20.07, -54.12, -107.92, 0.0]\n  Target bbox: [615.36, 320.74, 664.5, 397.87]\n\nFrame 4:\n  Drone pose: [-48.31, 48.06, 20.17, -57.41, -123.32, 0.0]\n  Target bbox: [670.82, 255.65, 710.77, 327.98]\n\nFrame 5 (current):\n  Drone pose: [-48.1, 47.24, 20.32, -56.61, -113.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 611.54, \"ymin\": 318.21, \"xmax\": 668.59, \"ymax\": 400.38}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": -0.54, \"dz\": -0.18, \"dpitch\": 3.28, \"dyaw\": -10.78, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.13, \"dz\": -0.2, \"dpitch\": 3.02, \"dyaw\": -13.97, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.68, \"dz\": -0.22, \"dpitch\": 2.84, \"dyaw\": -16.97, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -2.2, \"dz\": -0.23, \"dpitch\": 2.72, \"dyaw\": -19.84, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -2.71, \"dz\": -0.24, \"dpitch\": 2.66, \"dyaw\": -22.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 34.14, "window_alt_abs_m": 0.42, "target_px_mean_hist": 640.5, "cur_frame_id": 19, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.56, 41.37, 20.03, -57.87, -148.61, 0.0]\n  Target bbox: [572.99, 241.64, 620.24, 326.5]\n\nFrame 2:\n  Drone pose: [-48.75, 40.88, 20.03, -50.73, -150.23, 0.0]\n  Target bbox: [567.97, 363.69, 610.57, 441.64]\n\nFrame 3:\n  Drone pose: [-48.98, 40.41, 20.02, -57.29, -162.4, 0.0]\n  Target bbox: [668.65, 247.43, 714.18, 333.17]\n\nFrame 4:\n  Drone pose: [-49.24, 39.98, 20.02, -47.95, -159.89, 0.0]\n  Target bbox: [621.21, 400.72, 670.08, 486.49]\n\nFrame 5 (current):\n  Drone pose: [-49.57, 39.76, 20.05, -57.87, -154.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.82, \"ymin\": 238.66, \"xmax\": 631.68, \"ymax\": 312.89}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": -0.54, \"dz\": -0.04, \"dpitch\": 5.16, \"dyaw\": -8.67, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -0.87, \"dz\": -0.04, \"dpitch\": 5.28, \"dyaw\": -10.11, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -1.18, \"dz\": -0.04, \"dpitch\": 5.4, \"dyaw\": -11.42, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": -1.46, \"dz\": -0.04, \"dpitch\": 5.52, \"dyaw\": -12.62, \"droll\": 0.0}, {\"dx\": -1.72, \"dy\": -1.72, \"dz\": -0.04, \"dpitch\": 5.64, \"dyaw\": -13.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.12, "window_alt_abs_m": 0.05, "target_px_mean_hist": 689.2, "cur_frame_id": 34, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": 
"unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00049/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.66, 36.7, 20.0, -48.55, -171.03, 0.0]\n  Target bbox: [599.89, 373.43, 631.94, 444.27]\n\nFrame 2:\n  Drone pose: [-54.04, 36.5, 20.0, -49.82, -179.11, 0.0]\n  Target bbox: [677.85, 350.64, 708.17, 421.12]\n\nFrame 3:\n  Drone pose: [-54.41, 36.31, 20.0, -49.34, -169.98, 0.0]\n  Target bbox: [570.78, 353.89, 605.29, 427.04]\n\nFrame 4:\n  Drone pose: [-54.77, 36.12, 20.0, -48.16, -170.63, 0.0]\n  Target bbox: [570.35, 368.89, 603.21, 444.42]\n\nFrame 5 (current):\n  Drone pose: [-55.13, 35.93, 20.0, -50.63, -176.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.22, \"ymin\": 324.08, \"xmax\": 654.08, \"ymax\": 394.73}, \"waypoint_deltas\": [{\"dx\": -0.35, \"dy\": -0.18, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": -0.36, \"dz\": 0.0, \"dpitch\": 0.52, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": 0.76, \"dyaw\": -2.01, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": 1.03, \"dyaw\": -2.59, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.82, \"dz\": 0.0, \"dpitch\": 1.26, \"dyaw\": -3.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.58, "window_alt_abs_m": 0.0, "target_px_mean_hist": 668.0, "cur_frame_id": 49, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.63, 34.15, 19.89, -46.92, 179.2, 0.0]\n  Target bbox: [556.9, 252.85, 599.25, 329.61]\n\nFrame 2:\n  Drone pose: [-58.95, 34.0, 20.0, -41.9, 180.73, 0.0]\n  Target bbox: [582.08, 406.63, 609.44, 482.88]\n\nFrame 3:\n  Drone pose: [-59.23, 33.67, 20.0, -51.51, 179.12, 0.0]\n  Target bbox: [587.24, 237.3, 617.63, 315.17]\n\nFrame 4:\n  Drone pose: [-59.67, 33.22, 20.0, -50.27, 169.54, 0.0]\n  Target bbox: [681.94, 259.2, 713.53, 332.44]\n\nFrame 5 (current):\n  Drone pose: [-60.32, 32.66, 20.0, -46.08, 177.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 566.66, \"ymin\": 332.56, \"xmax\": 596.82, \"ymax\": 404.67}, \"waypoint_deltas\": [{\"dx\": -0.79, \"dy\": -0.57, \"dz\": 0.0, \"dpitch\": -0.76, \"dyaw\": -6.95, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": -1.14, \"dyaw\": -8.73, \"droll\": 0.0}, {\"dx\": -2.41, \"dy\": -1.44, \"dz\": 0.0, \"dpitch\": -1.49, \"dyaw\": -10.11, \"droll\": 0.0}, {\"dx\": -3.13, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -1.74, \"dyaw\": -11.04, \"droll\": 0.0}, {\"dx\": -3.76, \"dy\": -1.81, \"dz\": 0.0, \"dpitch\": -1.91, \"dyaw\": -11.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.86, "window_alt_abs_m": 0.11, "target_px_mean_hist": 609.5, "cur_frame_id": 64, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-67.57, 30.81, 20.0, -50.1, 163.35, 0.0]\n  Target bbox: [651.55, 285.24, 688.06, 362.82]\n\nFrame 2:\n  Drone pose: [-68.06, 30.79, 20.0, -43.73, 165.32, 0.0]\n  Target bbox: [630.12, 394.35, 662.98, 465.63]\n\nFrame 3:\n  Drone pose: [-68.55, 30.77, 20.0, -47.9, 165.84, 0.0]\n  Target bbox: [624.57, 326.54, 655.25, 392.43]\n\nFrame 4:\n  Drone pose: [-69.03, 30.77, 20.0, -44.3, 160.86, 0.0]\n  Target bbox: [677.38, 380.83, 715.81, 462.02]\n\nFrame 5 (current):\n  Drone pose: [-69.51, 30.79, 20.0, -49.73, 170.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 566.34, \"ymin\": 293.34, \"xmax\": 599.87, \"ymax\": 366.28}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 1.92, \"dyaw\": -4.87, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 1.95, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 1.98, \"dyaw\": -4.73, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 2.0, \"dyaw\": -4.76, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 2.02, \"dyaw\": -4.8, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.57, "window_alt_abs_m": 0.0, "target_px_mean_hist": 638.0, "cur_frame_id": 80, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-75.87, 30.91, 20.0, -47.72, 163.84, 0.0]\n  Target bbox: [653.49, 322.47, 686.03, 395.77]\n\nFrame 2:\n  Drone pose: [-76.42, 31.05, 19.95, -51.28, 165.23, 0.0]\n  Target bbox: [623.49, 324.97, 656.31, 393.76]\n\nFrame 3:\n  Drone pose: [-76.9, 30.8, 20.0, -47.08, 171.09, 0.0]\n  Target bbox: [565.1, 331.79, 600.63, 411.03]\n\nFrame 4:\n  Drone pose: [-77.44, 30.67, 20.0, -42.68, 166.9, 0.0]\n  Target bbox: [608.02, 405.28, 641.96, 482.39]\n\nFrame 5 (current):\n  Drone pose: [-77.99, 30.49, 20.0, -47.69, 164.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.19, \"ymin\": 322.2, \"xmax\": 656.48, \"ymax\": 396.92}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.55, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -0.68, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.38, \"droll\": 0.0}, {\"dx\": -2.33, \"dy\": -0.9, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -3.18, \"droll\": 0.0}, {\"dx\": -2.93, \"dy\": -1.09, \"dz\": 0.0, \"dpitch\": -0.39, \"dyaw\": -3.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.33, "window_alt_abs_m": 0.09, "target_px_mean_hist": 634.8, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00112/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.66, 29.2, 20.0, -42.79, 154.37, 0.0]\n  Target bbox: [610.4, 407.37, 647.1, 480.07]\n\nFrame 2:\n  Drone pose: [-85.29, 29.3, 20.0, -51.05, 147.89, 0.0]\n  Target bbox: [667.56, 267.09, 706.6, 341.56]\n\nFrame 3:\n  Drone pose: [-85.93, 29.42, 20.0, -49.82, 148.49, 0.0]\n  Target bbox: [651.16, 287.33, 679.16, 357.53]\n\nFrame 4:\n  Drone pose: [-86.52, 29.52, 20.06, -48.38, 155.71, 0.0]\n  Target bbox: [617.5, 318.0, 662.06, 401.3]\n\nFrame 5 (current):\n  Drone pose: [-87.21, 29.7, 20.0, -50.49, 153.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.52, \"ymin\": 272.64, \"xmax\": 602.05, \"ymax\": 345.51}, \"waypoint_deltas\": [{\"dx\": -0.62, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": 3.24, \"dyaw\": -6.16, \"droll\": 0.0}, {\"dx\": -1.24, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": 3.38, \"dyaw\": -7.26, \"droll\": 0.0}, {\"dx\": -1.86, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 3.52, \"dyaw\": -8.36, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": 3.63, \"dyaw\": -9.49, \"droll\": 0.0}, {\"dx\": -3.2, \"dy\": 0.85, \"dz\": 0.0, \"dpitch\": 3.7, \"dyaw\": -10.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.82, "window_alt_abs_m": 0.11, "target_px_mean_hist": 609.8, "cur_frame_id": 112, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.56, 32.48, 19.98, -41.76, 136.01, 0.0]\n  Target bbox: [622.37, 326.42, 657.95, 392.99]\n\nFrame 2:\n  Drone pose: [-95.14, 32.88, 20.0, -46.13, 142.07, 0.0]\n  Target bbox: [566.81, 342.36, 598.6, 413.02]\n\nFrame 3:\n  Drone pose: [-95.71, 33.28, 20.0, -50.4, 131.68, 0.0]\n  Target bbox: [672.8, 264.48, 722.71, 346.31]\n\nFrame 4:\n  Drone pose: [-96.29, 33.64, 20.07, -54.29, 134.56, 0.0]\n  Target bbox: [603.37, 254.71, 643.91, 330.26]\n\nFrame 5 (current):\n  Drone pose: [-96.78, 34.03, 20.0, -51.86, 137.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 611.21, \"ymin\": 241.42, \"xmax\": 636.73, \"ymax\": 309.79}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": 5.14, \"dyaw\": -1.73, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.72, \"dz\": 0.0, \"dpitch\": 5.28, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 1.08, \"dz\": 0.0, \"dpitch\": 5.41, \"dyaw\": -2.43, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 5.53, \"dyaw\": -2.78, \"droll\": 0.0}, {\"dx\": -2.6, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": 5.64, \"dyaw\": -3.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.08, "window_alt_abs_m": 0.17, "target_px_mean_hist": 601.8, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593/aug_001/frames_playback/frame_00142/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.26, 38.94, 20.0, -45.08, 136.82, 0.0]\n  Target bbox: [625.91, 325.47, 654.42, 393.71]\n\nFrame 2:\n  Drone pose: [-101.57, 39.46, 20.0, -44.89, 137.24, 0.0]\n  Target bbox: [623.15, 324.91, 657.26, 394.32]\n\nFrame 3:\n  Drone pose: [-101.88, 39.97, 20.0, -44.7, 137.65, 0.0]\n  Target bbox: [625.64, 325.36, 654.69, 393.86]\n\nFrame 4:\n  Drone pose: [-102.19, 40.49, 20.0, -44.67, 137.14, 0.0]\n  Target bbox: [636.41, 322.74, 666.32, 391.27]\n\nFrame 5 (current):\n  Drone pose: [-102.51, 41.0, 20.0, -44.32, 138.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.47, \"ymin\": 325.39, \"xmax\": 656.92, \"ymax\": 393.92}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -0.93, \"dy\": 1.55, \"dz\": 0.0, \"dpitch\": 0.8, \"dyaw\": 1.18, \"droll\": 0.0}, {\"dx\": -1.24, \"dy\": 2.06, \"dz\": 0.0, \"dpitch\": 0.54, \"dyaw\": 2.67, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 2.58, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 4.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.65, "window_alt_abs_m": 0.0, "target_px_mean_hist": 578.0, "cur_frame_id": 142, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776176593", "difficulty_score": 0.4975, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-114.57, 13.22, 22.0, -46.47, -81.47, 0.0]\n  Target bbox: [618.92, 323.39, 660.93, 396.02]\n\nFrame 2:\n  Drone pose: [-115.43, 11.42, 21.2, -46.87, -76.82, 0.0]\n  Target bbox: [620.56, 325.53, 659.77, 393.84]\n\nFrame 3:\n  Drone pose: [-115.56, 10.55, 20.67, -46.61, -76.2, 0.0]\n  Target bbox: [620.79, 324.62, 659.04, 394.74]\n\nFrame 4:\n  Drone pose: [-115.65, 9.74, 20.64, -46.81, -74.19, 0.0]\n  Target bbox: [618.34, 321.63, 661.41, 397.7]\n\nFrame 5 (current):\n  Drone pose: [-115.64, 9.04, 20.62, -46.83, -72.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.47, \"ymin\": 322.48, \"xmax\": 661.84, \"ymax\": 396.74}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.7, \"dz\": -0.03, \"dpitch\": -0.27, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -1.22, \"dz\": -0.05, \"dpitch\": -0.32, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -1.73, \"dz\": -0.07, \"dpitch\": -0.36, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": -2.24, \"dz\": -0.09, \"dpitch\": -0.38, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": -2.75, \"dz\": -0.2, \"dpitch\": -0.27, \"dyaw\": -0.97, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.92, "window_alt_abs_m": 1.38, "target_px_mean_hist": 674.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.1, 5.28, 20.36, -47.09, -73.95, 0.0]\n  Target bbox: [620.01, 323.4, 660.32, 395.87]\n\nFrame 2:\n  Drone pose: [-115.03, 4.78, 20.33, -47.08, -74.13, 0.0]\n  Target bbox: [617.69, 321.95, 662.66, 397.24]\n\nFrame 3:\n  Drone pose: [-114.98, 4.27, 20.3, -47.07, -74.29, 0.0]\n  Target bbox: [617.78, 321.72, 662.56, 397.44]\n\nFrame 4:\n  Drone pose: [-114.95, 3.77, 20.27, -47.05, -74.4, 0.0]\n  Target bbox: [620.02, 323.04, 660.32, 396.22]\n\nFrame 5 (current):\n  Drone pose: [-114.92, 3.26, 20.24, -47.03, -74.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.48, \"ymin\": 321.57, \"xmax\": 662.87, \"ymax\": 397.59}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.51, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.02, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.52, \"dz\": -0.07, \"dpitch\": 0.08, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.0, \"dz\": -0.09, \"dpitch\": 0.14, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.47, \"dz\": -0.1, \"dpitch\": 0.23, \"dyaw\": 0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.51, "window_alt_abs_m": 0.11, "target_px_mean_hist": 748.5, "cur_frame_id": 15, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00026/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.0, -0.09, 20.1, -46.57, -74.36, 0.0]\n  Target bbox: [617.08, 322.59, 663.28, 396.72]\n\nFrame 2:\n  Drone pose: [-115.0, -0.52, 20.09, -46.45, -74.41, 0.0]\n  Target bbox: [617.74, 321.59, 662.61, 397.59]\n\nFrame 3:\n  Drone pose: [-115.0, -0.95, 20.08, -46.32, -74.47, 0.0]\n  Target bbox: [615.67, 320.83, 664.71, 398.38]\n\nFrame 4:\n  Drone pose: [-114.99, -1.39, 20.07, -46.21, -74.55, 0.0]\n  Target bbox: [618.48, 322.32, 661.87, 396.94]\n\nFrame 5 (current):\n  Drone pose: [-114.99, -1.85, 20.06, -46.14, -74.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.4, \"ymin\": 323.69, \"xmax\": 659.93, \"ymax\": 395.64}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.47, \"dz\": -0.01, \"dpitch\": 0.06, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.85, \"dz\": -0.02, \"dpitch\": 0.18, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.24, \"dz\": -0.02, \"dpitch\": 0.21, \"dyaw\": -1.91, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": -1.53, \"dz\": -0.03, \"dpitch\": 0.3, \"dyaw\": -4.2, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -2.34, \"dz\": -0.03, \"dpitch\": -0.09, \"dyaw\": -3.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.21, "window_alt_abs_m": 0.04, "target_px_mean_hist": 764.2, "cur_frame_id": 26, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.08, -5.67, 20.02, -46.74, -75.58, 0.0]\n  Target bbox: [615.56, 319.84, 664.21, 399.54]\n\nFrame 2:\n  Drone pose: [-112.94, -6.37, 20.02, -46.88, -74.3, 0.0]\n  Target bbox: [616.94, 319.61, 662.81, 399.64]\n\nFrame 3:\n  Drone pose: [-112.81, -7.03, 20.01, -46.96, -73.01, 0.0]\n  Target bbox: [619.3, 322.63, 660.5, 396.63]\n\nFrame 4:\n  Drone pose: [-112.48, -7.63, 20.01, -47.02, -72.38, 0.0]\n  Target bbox: [620.24, 323.25, 659.54, 396.02]\n\nFrame 5 (current):\n  Drone pose: [-112.16, -8.23, 20.01, -47.09, -71.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.72, \"ymin\": 320.41, \"xmax\": 662.01, \"ymax\": 398.83}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": 0.59, \"dy\": -1.26, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -1.92, \"dz\": -0.01, \"dpitch\": -0.28, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": -2.6, \"dz\": -0.01, \"dpitch\": -0.38, \"dyaw\": 3.57, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": -3.28, \"dz\": -0.01, \"dpitch\": -0.46, \"dyaw\": 4.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.84, "window_alt_abs_m": 0.01, "target_px_mean_hist": 780.5, "cur_frame_id": 37, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.59, -12.88, 20.0, -47.64, -64.5, 0.0]\n  Target bbox: [618.3, 320.03, 661.34, 399.11]\n\nFrame 2:\n  Drone pose: [-110.43, -13.55, 20.0, -47.65, -63.25, 0.0]\n  Target bbox: [619.5, 321.45, 660.17, 397.7]\n\nFrame 3:\n  Drone pose: [-110.26, -14.21, 20.0, -47.63, -62.03, 0.0]\n  Target bbox: [619.62, 321.85, 660.03, 397.36]\n\nFrame 4:\n  Drone pose: [-110.09, -14.85, 20.0, -47.58, -60.84, 0.0]\n  Target bbox: [620.92, 323.49, 658.78, 395.66]\n\nFrame 5 (current):\n  Drone pose: [-109.91, -15.5, 20.0, -47.52, -59.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.14, \"ymin\": 324.24, \"xmax\": 657.56, \"ymax\": 394.97}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.63, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.25, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 2.24, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": 3.31, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": -2.48, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": 4.38, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -3.1, \"dz\": 0.0, \"dpitch\": 0.6, \"dyaw\": 5.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.81, "window_alt_abs_m": 0.0, "target_px_mean_hist": 792.5, "cur_frame_id": 48, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.6, -19.81, 20.0, -47.12, -53.64, 0.0]\n  Target bbox: [621.81, 322.92, 657.8, 396.23]\n\nFrame 2:\n  Drone pose: [-108.34, -20.4, 20.0, -47.0, -52.82, 0.0]\n  Target bbox: [620.08, 322.76, 659.49, 396.42]\n\nFrame 3:\n  Drone pose: [-108.06, -20.99, 20.0, -46.89, -52.09, 0.0]\n  Target bbox: [623.35, 324.56, 656.31, 394.58]\n\nFrame 4:\n  Drone pose: [-107.76, -21.56, 20.0, -46.8, -51.44, 0.0]\n  Target bbox: [618.79, 322.46, 660.74, 396.66]\n\nFrame 5 (current):\n  Drone pose: [-107.44, -22.13, 20.0, -46.7, -50.87, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.83, \"ymin\": 325.35, \"xmax\": 655.83, \"ymax\": 393.85}, \"waypoint_deltas\": [{\"dx\": 0.35, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": -1.12, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": 1.36, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": -2.21, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": 1.7, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": -2.74, \"dz\": 0.0, \"dpitch\": 0.57, \"dyaw\": 1.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.77, "window_alt_abs_m": 0.0, "target_px_mean_hist": 783.0, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.71, -25.9, 20.0, -46.01, -48.45, 0.0]\n  Target bbox: [620.2, 323.47, 659.35, 395.62]\n\nFrame 2:\n  Drone pose: [-104.28, -26.41, 20.0, -45.94, -48.28, 0.0]\n  Target bbox: [622.73, 323.29, 656.86, 395.83]\n\nFrame 3:\n  Drone pose: [-103.85, -26.9, 20.0, -45.86, -48.12, 0.0]\n  Target bbox: [619.5, 323.33, 660.03, 395.79]\n\nFrame 4:\n  Drone pose: [-103.44, -27.38, 20.0, -45.75, -47.95, 0.0]\n  Target bbox: [622.74, 325.11, 656.88, 394.05]\n\nFrame 5 (current):\n  Drone pose: [-103.05, -27.86, 20.0, -45.61, -47.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.08, \"ymin\": 323.73, \"xmax\": 658.45, \"ymax\": 395.47}, \"waypoint_deltas\": [{\"dx\": 0.36, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -0.93, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -1.38, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 1.21, \"dy\": -1.83, \"dz\": 0.0, \"dpitch\": 0.5, \"dyaw\": 0.3, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": -2.28, \"dz\": 0.0, \"dpitch\": 0.3, \"dyaw\": -0.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 768.0, "cur_frame_id": 70, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.11, -31.08, 20.0, -45.23, -48.2, 0.0]\n  Target bbox: [615.43, 319.77, 664.58, 399.7]\n\nFrame 2:\n  Drone pose: [-100.84, -31.56, 20.0, -45.47, -48.84, 0.0]\n  Target bbox: [624.81, 325.98, 654.86, 393.25]\n\nFrame 3:\n  Drone pose: [-100.57, -32.06, 20.0, -45.24, -48.32, 0.0]\n  Target bbox: [615.71, 319.6, 664.3, 399.82]\n\nFrame 4:\n  Drone pose: [-100.28, -32.55, 20.0, -45.53, -48.99, 0.0]\n  Target bbox: [620.91, 323.99, 658.67, 395.15]\n\nFrame 5 (current):\n  Drone pose: [-99.98, -33.06, 20.0, -45.34, -48.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.95, \"ymin\": 320.96, \"xmax\": 663.11, \"ymax\": 398.47}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.56, \"dyaw\": -0.78, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -2.48, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": -0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.3, "window_alt_abs_m": 0.0, "target_px_mean_hist": 620.0, "cur_frame_id": 81, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.17, -36.46, 20.0, -45.77, -49.44, 0.0]\n  Target bbox: [616.46, 321.57, 663.6, 397.93]\n\nFrame 2:\n  Drone pose: [-97.94, -36.92, 20.0, -45.94, -50.06, 0.0]\n  Target bbox: [619.46, 323.13, 660.08, 396.04]\n\nFrame 3:\n  Drone pose: [-97.73, -37.37, 20.0, -45.6, -49.5, 0.0]\n  Target bbox: [616.93, 321.17, 663.13, 398.26]\n\nFrame 4:\n  Drone pose: [-97.51, -37.82, 20.0, -45.76, -50.1, 0.0]\n  Target bbox: [623.7, 324.45, 655.94, 394.74]\n\nFrame 5 (current):\n  Drone pose: [-97.28, -38.28, 20.0, -45.44, -49.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.71, \"ymin\": 319.86, \"xmax\": 664.35, \"ymax\": 399.58}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.93, \"dz\": 0.0, \"dpitch\": -0.43, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": -1.39, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": -1.03, \"droll\": 0.0}, {\"dx\": 1.22, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -2.33, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.36, "window_alt_abs_m": 0.0, "target_px_mean_hist": 786.0, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/ORI/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.87, -41.55, 20.0, -45.09, -49.79, 0.0]\n  Target bbox: [623.22, 323.23, 656.38, 395.99]\n\nFrame 2:\n  Drone pose: [-94.51, -42.04, 20.0, -44.93, -49.5, 0.0]\n  Target bbox: [624.43, 326.21, 655.25, 393.04]\n\nFrame 3:\n  Drone pose: [-94.21, -42.53, 20.0, -44.73, -49.05, 0.0]\n  Target bbox: [622.97, 325.23, 656.64, 394.11]\n\nFrame 4:\n  Drone pose: [-93.99, -43.03, 20.0, -44.47, -48.43, 0.0]\n  Target bbox: [624.58, 326.31, 655.08, 393.01]\n\nFrame 5 (current):\n  Drone pose: [-93.83, -43.54, 20.0, -44.14, -47.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.47, \"ymin\": 319.95, \"xmax\": 664.54, \"ymax\": 399.61}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.96, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.62, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.76, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.28, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.1, "window_alt_abs_m": 0.0, "target_px_mean_hist": 746.2, "cur_frame_id": 103, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-114.57, 13.22, 22.0, -42.55, -86.47, 0.0]\n  Target bbox: [674.4, 390.1, 722.34, 464.91]\n\nFrame 2:\n  Drone pose: [-115.32, 11.32, 21.16, -50.4, -76.0, 0.0]\n  Target bbox: [607.57, 267.13, 647.32, 337.91]\n\nFrame 3:\n  Drone pose: [-115.57, 10.61, 20.77, -46.66, -76.21, 0.0]\n  Target bbox: [618.09, 321.78, 661.7, 397.57]\n\nFrame 4:\n  Drone pose: [-115.65, 9.74, 20.64, -47.18, -75.47, 0.0]\n  Target bbox: [630.37, 314.16, 679.54, 393.07]\n\nFrame 5 (current):\n  Drone pose: [-115.64, 9.04, 20.62, -44.17, -74.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 643.5, \"ymin\": 367.86, \"xmax\": 687.29, \"ymax\": 441.36}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.7, \"dz\": -0.03, \"dpitch\": -2.93, \"dyaw\": 2.35, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -1.22, \"dz\": -0.05, \"dpitch\": -2.98, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -1.73, \"dz\": -0.07, \"dpitch\": -3.02, \"dyaw\": 1.7, \"droll\": 0.0}, {\"dx\": 0.32, \"dy\": -2.24, \"dz\": -0.09, \"dpitch\": -3.04, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": -2.75, \"dz\": -0.2, \"dpitch\": -2.93, \"dyaw\": 1.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.19, "window_alt_abs_m": 1.38, "target_px_mean_hist": 702.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.1, 5.28, 20.36, -43.73, -73.45, 0.0]\n  Target bbox: [614.21, 380.23, 654.3, 452.11]\n\nFrame 2:\n  Drone pose: [-114.92, 4.73, 20.29, -43.12, -78.66, 0.0]\n  Target bbox: [668.49, 392.82, 709.31, 464.71]\n\nFrame 3:\n  Drone pose: [-114.98, 4.27, 20.3, -45.95, -79.29, 0.0]\n  Target bbox: [675.05, 343.32, 720.71, 416.89]\n\nFrame 4:\n  Drone pose: [-114.82, 3.89, 20.24, -46.87, -74.88, 0.0]\n  Target bbox: [617.66, 321.79, 662.7, 397.39]\n\nFrame 5 (current):\n  Drone pose: [-114.85, 3.32, 20.24, -48.06, -69.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.3, \"ymin\": 305.62, \"xmax\": 604.46, \"ymax\": 380.68}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": -0.57, \"dz\": -0.02, \"dpitch\": 1.05, \"dyaw\": -4.74, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.08, \"dz\": -0.05, \"dpitch\": 1.07, \"dyaw\": -4.72, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -1.58, \"dz\": -0.07, \"dpitch\": 1.11, \"dyaw\": -4.66, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": 1.17, \"dyaw\": -4.6, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -2.53, \"dz\": -0.1, \"dpitch\": 1.26, \"dyaw\": -4.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.38, "window_alt_abs_m": 0.13, "target_px_mean_hist": 739.5, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00026/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.0, -0.09, 20.1, -46.57, -74.36, 0.0]\n  Target bbox: [620.19, 323.58, 660.14, 395.73]\n\nFrame 2:\n  Drone pose: [-115.0, -0.52, 20.09, -50.38, -74.66, 0.0]\n  Target bbox: [618.72, 254.71, 667.61, 332.31]\n\nFrame 3:\n  Drone pose: [-114.85, -0.89, 20.13, -46.36, -74.96, 0.0]\n  Target bbox: [615.56, 321.84, 664.81, 397.43]\n\nFrame 4:\n  Drone pose: [-114.97, -1.47, 20.18, -46.51, -74.54, 0.0]\n  Target bbox: [615.39, 321.47, 664.98, 397.77]\n\nFrame 5 (current):\n  Drone pose: [-114.99, -1.85, 20.06, -46.14, -74.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.36, \"ymin\": 323.83, \"xmax\": 659.97, \"ymax\": 395.52}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.47, \"dz\": -0.01, \"dpitch\": 0.06, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.85, \"dz\": -0.02, \"dpitch\": 0.18, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.24, \"dz\": -0.02, \"dpitch\": 0.21, \"dyaw\": -1.91, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": -1.53, \"dz\": -0.03, \"dpitch\": 0.3, \"dyaw\": -4.2, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -2.34, \"dz\": -0.03, \"dpitch\": -0.09, \"dyaw\": -3.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.07, "window_alt_abs_m": 0.21, "target_px_mean_hist": 772.8, "cur_frame_id": 26, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.08, -5.67, 20.02, -47.98, -71.67, 0.0]\n  Target bbox: [572.95, 303.04, 615.78, 376.95]\n\nFrame 2:\n  Drone pose: [-112.94, -6.37, 20.02, -48.27, -77.55, 0.0]\n  Target bbox: [654.8, 297.78, 700.57, 376.52]\n\nFrame 3:\n  Drone pose: [-112.76, -6.9, 19.87, -44.95, -78.28, 0.0]\n  Target bbox: [676.67, 351.12, 719.49, 426.22]\n\nFrame 4:\n  Drone pose: [-112.48, -7.63, 20.01, -42.02, -71.08, 0.0]\n  Target bbox: [602.65, 405.83, 646.5, 481.99]\n\nFrame 5 (current):\n  Drone pose: [-112.27, -8.29, 20.1, -47.61, -67.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.85, \"ymin\": 315.39, \"xmax\": 616.74, \"ymax\": 394.46}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": -0.56, \"dz\": -0.09, \"dpitch\": 0.43, \"dyaw\": -3.65, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -1.2, \"dz\": -0.09, \"dpitch\": 0.34, \"dyaw\": -2.82, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -1.86, \"dz\": -0.1, \"dpitch\": 0.24, \"dyaw\": -1.87, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": -2.54, \"dz\": -0.1, \"dpitch\": 0.14, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": 1.35, \"dy\": -3.22, \"dz\": -0.1, \"dpitch\": 0.06, \"dyaw\": 0.38, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.51, "window_alt_abs_m": 0.37, "target_px_mean_hist": 760.8, "cur_frame_id": 37, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.59, -12.88, 20.0, -48.98, -60.29, 0.0]\n  Target bbox: [571.93, 301.67, 611.77, 375.2]\n\nFrame 2:\n  Drone pose: [-110.43, -13.55, 20.0, -43.06, -63.44, 0.0]\n  Target bbox: [620.23, 398.75, 663.96, 474.93]\n\nFrame 3:\n  Drone pose: [-110.12, -14.3, 20.0, -47.88, -62.28, 0.0]\n  Target bbox: [621.37, 323.57, 658.34, 395.6]\n\nFrame 4:\n  Drone pose: [-110.25, -14.82, 20.01, -48.21, -60.86, 0.0]\n  Target bbox: [626.45, 310.66, 663.35, 382.41]\n\nFrame 5 (current):\n  Drone pose: [-110.05, -15.52, 20.14, -48.83, -55.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 579.83, \"ymin\": 305.49, \"xmax\": 616.08, \"ymax\": 375.89}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": -0.61, \"dz\": -0.14, \"dpitch\": 1.4, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": 0.49, \"dy\": -1.23, \"dz\": -0.14, \"dpitch\": 1.5, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -1.84, \"dz\": -0.14, \"dpitch\": 1.64, \"dyaw\": -0.78, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": -2.46, \"dz\": -0.14, \"dpitch\": 1.76, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -3.08, \"dz\": -0.14, \"dpitch\": 1.91, \"dyaw\": 1.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.0, "window_alt_abs_m": 0.13, "target_px_mean_hist": 798.0, "cur_frame_id": 48, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.6, -19.81, 20.0, -43.6, -49.27, 0.0]\n  Target bbox: [571.11, 385.24, 607.72, 454.78]\n\nFrame 2:\n  Drone pose: [-108.34, -20.4, 20.0, -45.08, -49.98, 0.0]\n  Target bbox: [589.04, 355.46, 624.75, 429.43]\n\nFrame 3:\n  Drone pose: [-108.07, -20.99, 19.99, -42.41, -53.14, 0.0]\n  Target bbox: [634.9, 399.18, 670.3, 470.43]\n\nFrame 4:\n  Drone pose: [-107.76, -21.56, 20.0, -46.8, -51.44, 0.0]\n  Target bbox: [622.67, 324.01, 656.95, 395.17]\n\nFrame 5 (current):\n  Drone pose: [-107.43, -22.07, 20.04, -48.42, -49.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 608.99, \"ymin\": 293.7, \"xmax\": 643.12, \"ymax\": 367.22}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": -0.62, \"dz\": -0.04, \"dpitch\": 1.8, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": 0.7, \"dy\": -1.18, \"dz\": -0.04, \"dpitch\": 2.1, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -1.73, \"dz\": -0.04, \"dpitch\": 2.17, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": -2.27, \"dz\": -0.04, \"dpitch\": 2.23, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": 1.88, \"dy\": -2.8, \"dz\": -0.04, \"dpitch\": 2.29, \"dyaw\": 0.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.19, "window_alt_abs_m": 0.05, "target_px_mean_hist": 777.8, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.8, -25.85, 19.86, -45.64, -48.36, 0.0]\n  Target bbox: [623.7, 325.33, 655.95, 393.84]\n\nFrame 2:\n  Drone pose: [-104.28, -26.41, 20.0, -45.94, -48.28, 0.0]\n  Target bbox: [619.56, 323.14, 659.98, 395.93]\n\nFrame 3:\n  Drone pose: [-103.95, -26.87, 19.94, -46.36, -45.51, 0.0]\n  Target bbox: [595.34, 314.18, 626.64, 381.64]\n\nFrame 4:\n  Drone pose: [-103.25, -27.41, 20.04, -46.03, -48.33, 0.0]\n  Target bbox: [623.34, 325.31, 656.3, 393.84]\n\nFrame 5 (current):\n  Drone pose: [-103.05, -27.76, 19.95, -44.33, -44.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 578.1, \"ymin\": 345.45, \"xmax\": 608.27, \"ymax\": 412.49}, \"waypoint_deltas\": [{\"dx\": 0.36, \"dy\": -0.57, \"dz\": 0.05, \"dpitch\": -1.09, \"dyaw\": -3.47, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -1.03, \"dz\": 0.05, \"dpitch\": -0.86, \"dyaw\": -3.15, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -1.48, \"dz\": 0.05, \"dpitch\": -1.09, \"dyaw\": -3.88, \"droll\": 0.0}, {\"dx\": 1.21, \"dy\": -1.93, \"dz\": 0.05, \"dpitch\": -0.78, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": -2.38, \"dz\": 0.05, \"dpitch\": -0.98, \"dyaw\": -4.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.99, "window_alt_abs_m": 0.38, "target_px_mean_hist": 776.0, "cur_frame_id": 70, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.06, -31.18, 19.97, -47.58, -49.87, 0.0]\n  Target bbox: [636.75, 282.22, 686.11, 362.9]\n\nFrame 2:\n  Drone pose: [-100.7, -31.47, 20.02, -48.04, -46.64, 0.0]\n  Target bbox: [590.81, 282.83, 624.19, 353.75]\n\nFrame 3:\n  Drone pose: [-100.57, -32.06, 20.0, -45.24, -48.32, 0.0]\n  Target bbox: [616.38, 320.38, 663.65, 399.05]\n\nFrame 4:\n  Drone pose: [-100.23, -32.59, 19.82, -45.35, -49.04, 0.0]\n  Target bbox: [619.75, 323.11, 659.79, 396.0]\n\nFrame 5 (current):\n  Drone pose: [-99.94, -33.24, 19.96, -49.73, -47.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 603.45, \"ymin\": 250.83, \"xmax\": 649.95, \"ymax\": 327.13}, \"waypoint_deltas\": [{\"dx\": 0.24, \"dy\": -0.33, \"dz\": 0.04, \"dpitch\": 4.1, \"dyaw\": -2.0, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -0.83, \"dz\": 0.04, \"dpitch\": 4.33, \"dyaw\": -1.5, \"droll\": 0.0}, {\"dx\": 0.79, \"dy\": -1.32, \"dz\": 0.04, \"dpitch\": 3.83, \"dyaw\": -2.15, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": -1.82, \"dz\": 0.04, \"dpitch\": 4.07, \"dyaw\": -1.6, \"droll\": 0.0}, {\"dx\": 1.3, \"dy\": -2.3, \"dz\": 0.04, \"dpitch\": 3.85, \"dyaw\": -2.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.54, "window_alt_abs_m": 0.38, "target_px_mean_hist": 646.5, "cur_frame_id": 81, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.08, -36.35, 20.1, -45.55, -48.82, 0.0]\n  Target bbox: [603.93, 326.64, 650.7, 404.08]\n\nFrame 2:\n  Drone pose: [-98.03, -36.76, 19.95, -45.58, -50.18, 0.0]\n  Target bbox: [624.43, 325.89, 655.25, 393.36]\n\nFrame 3:\n  Drone pose: [-97.73, -37.37, 20.0, -45.6, -49.5, 0.0]\n  Target bbox: [617.19, 322.08, 662.88, 397.41]\n\nFrame 4:\n  Drone pose: [-97.6, -37.85, 19.99, -48.13, -48.38, 0.0]\n  Target bbox: [607.89, 285.07, 636.98, 352.09]\n\nFrame 5 (current):\n  Drone pose: [-97.31, -38.18, 20.11, -45.53, -49.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.27, \"ymin\": 320.92, \"xmax\": 665.95, \"ymax\": 396.55}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.56, \"dz\": -0.11, \"dpitch\": -0.11, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.03, \"dz\": -0.11, \"dpitch\": -0.34, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -1.49, \"dz\": -0.11, \"dpitch\": -0.13, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": 1.25, \"dy\": -1.96, \"dz\": -0.11, \"dpitch\": 0.04, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 1.64, \"dy\": -2.43, \"dz\": -0.11, \"dpitch\": 0.18, \"dyaw\": -0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.67, "window_alt_abs_m": 0.35, "target_px_mean_hist": 769.8, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611/aug_001/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.91, -41.58, 19.81, -44.79, -49.65, 0.0]\n  Target bbox: [620.83, 324.05, 658.75, 395.15]\n\nFrame 2:\n  Drone pose: [-94.51, -42.04, 20.0, -46.11, -47.5, 0.0]\n  Target bbox: [599.48, 304.36, 631.99, 375.9]\n\nFrame 3:\n  Drone pose: [-94.2, -42.65, 19.91, -44.74, -48.85, 0.0]\n  Target bbox: [623.66, 325.83, 655.99, 393.41]\n\nFrame 4:\n  Drone pose: [-93.91, -43.06, 20.0, -39.65, -47.04, 0.0]\n  Target bbox: [606.86, 409.69, 635.91, 475.67]\n\nFrame 5 (current):\n  Drone pose: [-93.83, -43.54, 20.0, -48.42, -46.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 603.74, \"ymin\": 250.06, \"xmax\": 648.7, \"ymax\": 325.5}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 4.19, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.96, \"dz\": 0.0, \"dpitch\": 4.16, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": 4.17, \"dyaw\": -1.74, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": 4.22, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.28, \"dz\": 0.0, \"dpitch\": 4.32, \"dyaw\": -1.99, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.76, "window_alt_abs_m": 0.37, "target_px_mean_hist": 745.5, "cur_frame_id": 103, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776333611", "difficulty_score": 0.2215, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.89, 16.22, 22.0, -46.21, -43.53, 0.0]\n  Target bbox: [624.8, 328.02, 655.49, 391.13]\n\nFrame 2:\n  Drone pose: [50.36, 14.67, 21.2, -44.97, -39.33, 0.0]\n  Target bbox: [624.95, 326.23, 655.35, 392.99]\n\nFrame 3:\n  Drone pose: [50.41, 13.67, 20.67, -44.16, -37.42, 0.0]\n  Target bbox: [628.5, 326.68, 651.7, 392.47]\n\nFrame 4:\n  Drone pose: [50.77, 12.95, 20.64, -44.15, -36.67, 0.0]\n  Target bbox: [623.05, 325.97, 657.25, 393.28]\n\nFrame 5 (current):\n  Drone pose: [51.26, 12.32, 20.62, -44.2, -36.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.4, \"ymin\": 328.06, \"xmax\": 650.76, \"ymax\": 391.04}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.58, \"dz\": -0.03, \"dpitch\": -0.04, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -1.16, \"dz\": -0.05, \"dpitch\": -0.09, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": -1.74, \"dz\": -0.07, \"dpitch\": -0.14, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": -2.3, \"dz\": -0.09, \"dpitch\": -0.17, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -2.86, \"dz\": -0.2, \"dpitch\": -0.07, \"dyaw\": 0.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.16, "window_alt_abs_m": 1.38, "target_px_mean_hist": 521.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.32, 7.86, 20.33, -44.22, -35.39, 0.0]\n  Target bbox: [625.8, 326.84, 654.38, 392.19]\n\nFrame 2:\n  Drone pose: [55.81, 7.36, 20.3, -44.17, -35.37, 0.0]\n  Target bbox: [626.83, 327.9, 653.32, 391.08]\n\nFrame 3:\n  Drone pose: [56.29, 6.87, 20.27, -44.1, -35.37, 0.0]\n  Target bbox: [626.76, 326.06, 653.47, 393.11]\n\nFrame 4:\n  Drone pose: [56.77, 6.41, 20.24, -44.0, -35.42, 0.0]\n  Target bbox: [625.99, 326.37, 654.22, 392.73]\n\nFrame 5 (current):\n  Drone pose: [57.24, 5.98, 20.22, -43.87, -35.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.0, \"ymin\": 325.01, \"xmax\": 656.3, \"ymax\": 394.24}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": -0.4, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -0.74, \"dz\": -0.05, \"dpitch\": 0.34, \"dyaw\": -0.52, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": -1.04, \"dz\": -0.07, \"dpitch\": 0.56, \"dyaw\": -0.95, \"droll\": 0.0}, {\"dx\": 1.9, \"dy\": -1.3, \"dz\": -0.08, \"dpitch\": 0.41, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 2.38, \"dy\": -1.53, \"dz\": -0.1, \"dpitch\": 0.27, \"dyaw\": 0.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.19, "window_alt_abs_m": 0.11, "target_px_mean_hist": 552.0, "cur_frame_id": 16, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [61.08, 3.65, 20.08, -43.74, -34.49, 0.0]\n  Target bbox: [627.9, 325.97, 652.29, 393.14]\n\nFrame 2:\n  Drone pose: [61.59, 3.32, 20.07, -43.6, -34.9, 0.0]\n  Target bbox: [628.33, 326.75, 651.86, 392.39]\n\nFrame 3:\n  Drone pose: [62.1, 2.96, 20.06, -43.48, -35.25, 0.0]\n  Target bbox: [626.22, 327.25, 653.98, 391.91]\n\nFrame 4:\n  Drone pose: [62.62, 2.56, 20.05, -43.41, -35.53, 0.0]\n  Target bbox: [623.82, 325.35, 656.44, 393.81]\n\nFrame 5 (current):\n  Drone pose: [63.16, 2.14, 20.04, -43.38, -35.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.13, \"ymin\": 327.09, \"xmax\": 653.05, \"ymax\": 391.99}, \"waypoint_deltas\": [{\"dx\": 0.59, \"dy\": -0.38, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 1.31, \"dy\": -0.65, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": -1.33, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": -0.83, \"dz\": -0.01, \"dpitch\": -0.18, \"dyaw\": -2.73, \"droll\": 0.0}, {\"dx\": 3.0, \"dy\": -1.32, \"dz\": -0.02, \"dpitch\": -0.52, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": 3.57, \"dy\": -2.05, \"dz\": -0.02, \"dpitch\": -0.81, \"dyaw\": -2.92, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.29, "window_alt_abs_m": 0.04, "target_px_mean_hist": 564.0, "cur_frame_id": 28, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [68.06, -1.8, 20.01, -44.33, -37.48, 0.0]\n  Target bbox: [627.34, 327.59, 652.84, 391.47]\n\nFrame 2:\n  Drone pose: [68.61, -2.25, 20.01, -44.34, -37.7, 0.0]\n  Target bbox: [622.61, 324.93, 657.71, 394.24]\n\nFrame 3:\n  Drone pose: [69.17, -2.66, 20.01, -44.33, -38.01, 0.0]\n  Target bbox: [623.72, 325.63, 656.54, 393.41]\n\nFrame 4:\n  Drone pose: [69.75, -3.05, 20.01, -44.31, -38.4, 0.0]\n  Target bbox: [629.28, 327.05, 650.89, 391.99]\n\nFrame 5 (current):\n  Drone pose: [70.34, -3.42, 20.0, -44.31, -38.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.33, \"ymin\": 324.53, \"xmax\": 655.99, \"ymax\": 394.63}, \"waypoint_deltas\": [{\"dx\": 0.63, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": -0.72, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -1.17, \"droll\": 0.0}, {\"dx\": 1.97, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -1.86, \"droll\": 0.0}, {\"dx\": 2.65, \"dy\": -1.42, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -2.57, \"droll\": 0.0}, {\"dx\": 3.31, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": -3.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.4, "window_alt_abs_m": 0.01, "target_px_mean_hist": 574.8, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [75.36, -6.17, 20.0, -44.74, -44.7, 0.0]\n  Target bbox: [618.54, 321.36, 661.34, 397.96]\n\nFrame 2:\n  Drone pose: [75.77, -6.49, 20.0, -44.98, -45.94, 0.0]\n  Target bbox: [622.54, 324.2, 657.44, 394.97]\n\nFrame 3:\n  Drone pose: [76.07, -6.84, 20.0, -45.14, -46.92, 0.0]\n  Target bbox: [622.65, 324.17, 657.34, 394.97]\n\nFrame 4:\n  Drone pose: [76.37, -7.22, 20.0, -45.3, -47.86, 0.0]\n  Target bbox: [625.22, 325.33, 654.86, 393.7]\n\nFrame 5 (current):\n  Drone pose: [76.9, -7.53, 20.0, -45.63, -49.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.81, \"ymin\": 324.19, \"xmax\": 655.28, \"ymax\": 394.76}, \"waypoint_deltas\": [{\"dx\": 1.5, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": -0.96, \"dyaw\": -4.51, \"droll\": 0.0}, {\"dx\": 3.36, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -1.96, \"dyaw\": -10.45, \"droll\": 0.0}, {\"dx\": 5.25, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -16.88, \"droll\": 0.0}, {\"dx\": 7.56, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -3.05, \"dyaw\": -25.03, \"droll\": 0.0}, {\"dx\": 9.91, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": -2.86, \"dyaw\": -33.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.79, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.0, "cur_frame_id": 52, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.2, -8.35, 20.0, -46.33, -100.58, 0.0]\n  Target bbox: [622.21, 324.38, 657.53, 394.59]\n\nFrame 2:\n  Drone pose: [92.35, -8.86, 20.0, -46.29, -101.06, 0.0]\n  Target bbox: [619.39, 323.99, 660.32, 395.02]\n\nFrame 3:\n  Drone pose: [92.41, -9.36, 20.0, -46.28, -101.23, 0.0]\n  Target bbox: [620.59, 324.6, 659.16, 394.35]\n\nFrame 4:\n  Drone pose: [92.42, -9.87, 20.0, -46.28, -101.29, 0.0]\n  Target bbox: [626.41, 326.1, 653.41, 392.86]\n\nFrame 5 (current):\n  Drone pose: [92.43, -10.37, 20.0, -46.28, -101.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.76, \"ymin\": 325.2, \"xmax\": 651.07, \"ymax\": 393.66}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.48, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.98, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -2.46, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 596.8, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.6, -14.22, 20.0, -46.0, -101.75, 0.0]\n  Target bbox: [628.84, 325.41, 650.99, 393.48]\n\nFrame 2:\n  Drone pose: [92.65, -14.68, 20.0, -45.91, -101.88, 0.0]\n  Target bbox: [628.82, 325.66, 651.01, 393.3]\n\nFrame 3:\n  Drone pose: [92.71, -15.13, 20.0, -45.83, -102.01, 0.0]\n  Target bbox: [627.19, 325.25, 652.62, 393.66]\n\nFrame 4:\n  Drone pose: [92.75, -15.59, 20.0, -45.76, -102.13, 0.0]\n  Target bbox: [629.2, 325.82, 650.64, 393.14]\n\nFrame 5 (current):\n  Drone pose: [92.8, -16.06, 20.0, -45.7, -102.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.39, \"ymin\": 326.09, \"xmax\": 655.41, \"ymax\": 392.91}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -1.47, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.28, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.98, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.49, "window_alt_abs_m": 0.0, "target_px_mean_hist": 563.5, "cur_frame_id": 77, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.91, -20.17, 20.0, -45.82, -102.66, 0.0]\n  Target bbox: [622.86, 325.54, 656.93, 393.45]\n\nFrame 2:\n  Drone pose: [92.88, -20.72, 20.0, -45.9, -102.6, 0.0]\n  Target bbox: [620.51, 324.22, 659.23, 394.83]\n\nFrame 3:\n  Drone pose: [92.85, -21.26, 20.0, -45.98, -102.53, 0.0]\n  Target bbox: [619.69, 324.43, 660.06, 394.59]\n\nFrame 4:\n  Drone pose: [92.81, -21.81, 20.0, -46.07, -102.45, 0.0]\n  Target bbox: [622.31, 325.32, 657.47, 393.68]\n\nFrame 5 (current):\n  Drone pose: [92.78, -22.35, 20.0, -46.13, -102.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.79, \"ymin\": 324.91, \"xmax\": 659.96, \"ymax\": 394.12}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.31, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.53, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": 0.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.28, "window_alt_abs_m": 0.0, "target_px_mean_hist": 571.8, "cur_frame_id": 89, "source": "ORI", "fut_invisible_cnt": 3}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.85, -27.24, 20.0, -46.18, -102.53, 0.0]\n  Target bbox: [626.1, 325.52, 653.68, 393.52]\n\nFrame 2:\n  Drone pose: [92.86, -27.74, 20.0, -46.17, -102.54, 0.0]\n  Target bbox: [623.88, 325.03, 655.91, 393.91]\n\nFrame 3:\n  Drone pose: [92.86, -28.25, 20.0, -46.19, -102.57, 0.0]\n  Target bbox: [626.67, 326.26, 653.15, 392.71]\n\nFrame 4:\n  Drone pose: [92.85, -28.77, 20.0, -46.22, -102.53, 0.0]\n  Target bbox: [628.96, 325.97, 650.89, 392.93]\n\nFrame 5 (current):\n  Drone pose: [92.82, -29.29, 20.0, -46.26, -102.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.87, \"ymin\": 325.82, \"xmax\": 650.96, \"ymax\": 393.15}, \"waypoint_deltas\": [{\"dx\": -0.03, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.53, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.17, "window_alt_abs_m": 0.0, "target_px_mean_hist": 597.5, "cur_frame_id": 103, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/ORI/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.27, -33.24, 20.0, -46.2, -102.28, 0.0]\n  Target bbox: [619.64, 324.15, 660.09, 394.89]\n\nFrame 2:\n  Drone pose: [93.26, -33.77, 20.0, -46.25, -102.28, 0.0]\n  Target bbox: [623.02, 324.92, 656.76, 394.04]\n\nFrame 3:\n  Drone pose: [93.24, -34.31, 20.0, -46.31, -102.23, 0.0]\n  Target bbox: [629.15, 325.12, 650.68, 393.76]\n\nFrame 4:\n  Drone pose: [93.22, -34.84, 20.0, -46.37, -102.17, 0.0]\n  Target bbox: [627.8, 326.16, 652.03, 392.78]\n\nFrame 5 (current):\n  Drone pose: [93.19, -35.36, 20.0, -46.41, -102.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.95, \"ymin\": 324.41, \"xmax\": 657.79, \"ymax\": 394.58}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.53, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.15, "window_alt_abs_m": 0.0, "target_px_mean_hist": 575.5, "cur_frame_id": 115, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [50.89, 16.22, 22.0, -50.78, -46.16, 0.0]\n  Target bbox: [657.27, 253.22, 684.48, 313.04]\n\nFrame 2:\n  Drone pose: [50.4, 14.64, 21.08, -44.87, -39.32, 0.0]\n  Target bbox: [628.89, 328.56, 651.27, 390.48]\n\nFrame 3:\n  Drone pose: [50.41, 13.67, 20.67, -39.16, -38.22, 0.0]\n  Target bbox: [636.86, 410.53, 663.17, 476.95]\n\nFrame 4:\n  Drone pose: [50.77, 12.95, 20.64, -44.15, -36.67, 0.0]\n  Target bbox: [624.47, 325.73, 655.81, 393.5]\n\nFrame 5 (current):\n  Drone pose: [51.29, 12.34, 20.77, -46.97, -38.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 657.24, \"ymin\": 284.89, \"xmax\": 681.11, \"ymax\": 350.07}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": -0.6, \"dz\": -0.18, \"dpitch\": 2.73, \"dyaw\": 2.67, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": -1.18, \"dz\": -0.2, \"dpitch\": 2.68, \"dyaw\": 2.83, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -1.76, \"dz\": -0.22, \"dpitch\": 2.63, \"dyaw\": 2.99, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -2.32, \"dz\": -0.24, \"dpitch\": 2.6, \"dyaw\": 3.13, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": -2.88, \"dz\": -0.35, \"dpitch\": 2.7, \"dyaw\": 3.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.67, "window_alt_abs_m": 1.48, "target_px_mean_hist": 536.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00016/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.24, 7.79, 20.37, -44.27, -35.08, 0.0]\n  Target bbox: [627.53, 326.87, 652.67, 392.27]\n\nFrame 2:\n  Drone pose: [55.75, 7.4, 20.18, -43.89, -35.38, 0.0]\n  Target bbox: [624.22, 323.86, 656.1, 395.37]\n\nFrame 3:\n  Drone pose: [56.29, 6.87, 20.27, -44.1, -35.37, 0.0]\n  Target bbox: [627.32, 327.84, 652.85, 391.26]\n\nFrame 4:\n  Drone pose: [56.69, 6.39, 20.31, -47.67, -36.02, 0.0]\n  Target bbox: [633.97, 264.95, 665.75, 332.12]\n\nFrame 5 (current):\n  Drone pose: [57.24, 5.98, 20.22, -40.27, -35.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.19, \"ymin\": 386.47, \"xmax\": 648.29, \"ymax\": 453.65}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": -0.4, \"dz\": -0.03, \"dpitch\": -3.45, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -0.74, \"dz\": -0.05, \"dpitch\": -3.26, \"dyaw\": -1.04, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": -1.04, \"dz\": -0.07, \"dpitch\": -3.04, \"dyaw\": -1.47, \"droll\": 0.0}, {\"dx\": 1.9, \"dy\": -1.3, \"dz\": -0.08, \"dpitch\": -3.19, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 2.38, \"dy\": -1.53, \"dz\": -0.1, \"dpitch\": -3.33, \"dyaw\": -0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.98, "window_alt_abs_m": 0.42, "target_px_mean_hist": 547.2, "cur_frame_id": 16, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00028/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [61.08, 3.65, 20.08, -43.74, -34.49, 0.0]\n  Target bbox: [622.3, 324.53, 658.01, 394.72]\n\nFrame 2:\n  Drone pose: [61.59, 3.32, 20.07, -41.27, -38.97, 0.0]\n  Target bbox: [675.64, 368.4, 704.13, 431.23]\n\nFrame 3:\n  Drone pose: [62.0, 3.04, 19.94, -42.19, -31.83, 0.0]\n  Target bbox: [580.37, 340.81, 615.02, 411.36]\n\nFrame 4:\n  Drone pose: [62.62, 2.56, 20.05, -42.6, -40.53, 0.0]\n  Target bbox: [688.87, 342.4, 713.74, 407.63]\n\nFrame 5 (current):\n  Drone pose: [63.13, 1.98, 19.99, -42.18, -40.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 682.8, \"ymin\": 347.52, \"xmax\": 720.3, \"ymax\": 416.07}, \"waypoint_deltas\": [{\"dx\": 0.62, \"dy\": -0.22, \"dz\": 0.05, \"dpitch\": -1.19, \"dyaw\": 4.13, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -0.49, \"dz\": 0.04, \"dpitch\": -1.25, \"dyaw\": 3.23, \"droll\": 0.0}, {\"dx\": 2.22, \"dy\": -0.67, \"dz\": 0.04, \"dpitch\": -1.38, \"dyaw\": 1.83, \"droll\": 0.0}, {\"dx\": 3.03, \"dy\": -1.16, \"dz\": 0.03, \"dpitch\": -1.72, \"dyaw\": 1.25, \"droll\": 0.0}, {\"dx\": 3.6, \"dy\": -1.89, \"dz\": 0.03, \"dpitch\": -2.01, \"dyaw\": 1.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.49, "window_alt_abs_m": 0.32, "target_px_mean_hist": 562.5, "cur_frame_id": 28, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [68.2, -1.8, 19.95, -40.61, -40.14, 0.0]\n  Target bbox: [656.11, 389.45, 682.52, 458.25]\n\nFrame 2:\n  Drone pose: [68.61, -2.32, 19.99, -47.2, -36.82, 0.0]\n  Target bbox: [619.72, 279.32, 643.21, 345.33]\n\nFrame 3:\n  Drone pose: [69.17, -2.66, 20.01, -43.83, -35.02, 0.0]\n  Target bbox: [586.47, 333.86, 621.2, 403.17]\n\nFrame 4:\n  Drone pose: [69.75, -3.05, 20.01, -43.38, -37.05, 0.0]\n  Target bbox: [611.7, 342.54, 635.43, 408.01]\n\nFrame 5 (current):\n  Drone pose: [70.34, -3.42, 20.0, -41.79, -34.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.01, \"ymin\": 366.97, \"xmax\": 607.22, \"ymax\": 439.47}, \"waypoint_deltas\": [{\"dx\": 0.63, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -2.53, \"dyaw\": -4.73, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": -0.72, \"dz\": 0.0, \"dpitch\": -2.57, \"dyaw\": -5.35, \"droll\": 0.0}, {\"dx\": 1.97, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": -2.63, \"dyaw\": -6.04, \"droll\": 0.0}, {\"dx\": 2.65, \"dy\": -1.42, \"dz\": 0.0, \"dpitch\": -2.68, \"dyaw\": -6.75, \"droll\": 0.0}, {\"dx\": 3.31, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": -2.71, \"dyaw\": -7.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.5, "window_alt_abs_m": 0.06, "target_px_mean_hist": 590.0, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [75.46, -6.21, 19.98, -44.67, -44.95, 0.0]\n  Target bbox: [625.39, 328.24, 658.05, 397.31]\n\nFrame 2:\n  Drone pose: [75.77, -6.49, 20.0, -45.6, -40.94, 0.0]\n  Target bbox: [563.5, 315.94, 597.06, 386.0]\n\nFrame 3:\n  Drone pose: [76.07, -6.84, 20.0, -45.14, -46.92, 0.0]\n  Target bbox: [623.44, 323.94, 656.61, 395.12]\n\nFrame 4:\n  Drone pose: [76.19, -7.22, 20.05, -48.44, -46.91, 0.0]\n  Target bbox: [611.85, 266.15, 655.38, 344.09]\n\nFrame 5 (current):\n  Drone pose: [76.88, -7.69, 19.9, -45.64, -49.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.3, \"ymin\": 321.93, \"xmax\": 659.69, \"ymax\": 397.18}, \"waypoint_deltas\": [{\"dx\": 1.52, \"dy\": 0.07, \"dz\": 0.1, \"dpitch\": -0.95, \"dyaw\": -4.9, \"droll\": 0.0}, {\"dx\": 3.38, \"dy\": 0.05, \"dz\": 0.1, \"dpitch\": -1.95, \"dyaw\": -10.84, \"droll\": 0.0}, {\"dx\": 5.27, \"dy\": 0.04, \"dz\": 0.1, \"dpitch\": -2.65, \"dyaw\": -17.27, \"droll\": 0.0}, {\"dx\": 7.58, \"dy\": 0.11, \"dz\": 0.1, \"dpitch\": -3.04, \"dyaw\": -25.42, \"droll\": 0.0}, {\"dx\": 9.93, \"dy\": 0.2, \"dz\": 0.1, \"dpitch\": -2.85, \"dyaw\": -33.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.16, "window_alt_abs_m": 0.23, "target_px_mean_hist": 549.2, "cur_frame_id": 52, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.04, -8.31, 19.96, -46.24, -100.06, 0.0]\n  Target bbox: [628.29, 324.65, 651.52, 394.22]\n\nFrame 2:\n  Drone pose: [92.34, -8.99, 20.1, -46.64, -101.1, 0.0]\n  Target bbox: [629.06, 325.94, 650.78, 392.96]\n\nFrame 3:\n  Drone pose: [92.41, -9.36, 20.0, -47.43, -98.6, 0.0]\n  Target bbox: [597.82, 306.92, 620.33, 374.45]\n\nFrame 4:\n  Drone pose: [92.42, -9.87, 20.0, -45.16, -96.97, 0.0]\n  Target bbox: [576.26, 345.74, 602.85, 413.35]\n\nFrame 5 (current):\n  Drone pose: [92.55, -10.33, 20.12, -42.38, -104.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 656.79, \"ymin\": 391.9, \"xmax\": 682.96, \"ymax\": 461.4}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": -0.54, \"dz\": -0.12, \"dpitch\": -3.9, \"dyaw\": 2.91, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -1.03, \"dz\": -0.12, \"dpitch\": -3.89, \"dyaw\": 2.89, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -1.52, \"dz\": -0.12, \"dpitch\": -3.87, \"dyaw\": 2.87, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -2.02, \"dz\": -0.12, \"dpitch\": -3.86, \"dyaw\": 2.84, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -2.5, \"dz\": -0.12, \"dpitch\": -3.83, \"dyaw\": 2.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.42, "window_alt_abs_m": 0.35, "target_px_mean_hist": 580.2, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.59, -14.38, 20.1, -46.41, -101.83, 0.0]\n  Target bbox: [629.12, 325.67, 650.72, 393.25]\n\nFrame 2:\n  Drone pose: [92.65, -14.75, 19.82, -48.59, -104.66, 0.0]\n  Target bbox: [657.88, 276.61, 687.22, 348.43]\n\nFrame 3:\n  Drone pose: [92.86, -15.16, 20.11, -45.99, -102.49, 0.0]\n  Target bbox: [628.34, 325.62, 651.48, 393.33]\n\nFrame 4:\n  Drone pose: [92.63, -15.73, 19.93, -45.9, -101.85, 0.0]\n  Target bbox: [622.96, 324.69, 656.82, 394.24]\n\nFrame 5 (current):\n  Drone pose: [92.88, -16.01, 19.91, -45.45, -102.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.68, \"ymin\": 325.48, \"xmax\": 657.11, \"ymax\": 393.55}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.53, \"dz\": 0.09, \"dpitch\": -0.2, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.02, \"dz\": 0.09, \"dpitch\": -0.18, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.52, \"dz\": 0.09, \"dpitch\": -0.17, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.03, \"dz\": 0.09, \"dpitch\": -0.17, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -2.55, \"dz\": 0.09, \"dpitch\": -0.19, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.25, "window_alt_abs_m": 0.77, "target_px_mean_hist": 559.2, "cur_frame_id": 77, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.96, -20.32, 20.11, -50.03, -99.08, 0.0]\n  Target bbox: [576.34, 261.91, 613.35, 330.82]\n\nFrame 2:\n  Drone pose: [92.88, -20.72, 20.0, -40.94, -100.85, 0.0]\n  Target bbox: [608.23, 408.47, 630.01, 477.67]\n\nFrame 3:\n  Drone pose: [92.84, -21.33, 20.19, -46.36, -102.56, 0.0]\n  Target bbox: [628.88, 325.63, 650.95, 393.3]\n\nFrame 4:\n  Drone pose: [92.68, -21.85, 20.14, -44.63, -102.24, 0.0]\n  Target bbox: [628.71, 355.73, 654.82, 422.29]\n\nFrame 5 (current):\n  Drone pose: [92.78, -22.35, 20.0, -46.13, -102.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.78, \"ymin\": 324.81, \"xmax\": 656.97, \"ymax\": 394.22}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.31, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.53, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": 0.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.95, "window_alt_abs_m": 0.48, "target_px_mean_hist": 563.8, "cur_frame_id": 89, "source": "aug_001", "fut_invisible_cnt": 3}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.99, -27.19, 20.13, -50.06, -102.52, 0.0]\n  Target bbox: [621.49, 261.31, 648.88, 329.25]\n\nFrame 2:\n  Drone pose: [92.67, -27.73, 19.96, -46.17, -101.98, 0.0]\n  Target bbox: [629.23, 325.16, 650.6, 393.73]\n\nFrame 3:\n  Drone pose: [92.69, -28.18, 19.95, -48.26, -101.67, 0.0]\n  Target bbox: [624.68, 288.89, 647.74, 356.7]\n\nFrame 4:\n  Drone pose: [92.85, -28.8, 20.0, -43.21, -101.21, 0.0]\n  Target bbox: [611.78, 377.6, 635.68, 444.02]\n\nFrame 5 (current):\n  Drone pose: [92.76, -29.22, 19.86, -45.97, -102.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.35, \"ymin\": 325.02, \"xmax\": 656.42, \"ymax\": 394.02}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.59, \"dz\": 0.14, \"dpitch\": -0.33, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.1, \"dz\": 0.14, \"dpitch\": -0.35, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.6, \"dz\": 0.14, \"dpitch\": -0.33, \"dyaw\": -0.28, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -2.07, \"dz\": 0.14, \"dpitch\": -0.26, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": 0.3, \"dy\": -2.54, \"dz\": 0.14, \"dpitch\": -0.16, \"dyaw\": -0.92, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.35, "window_alt_abs_m": 0.36, "target_px_mean_hist": 574.0, "cur_frame_id": 103, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463/aug_001/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.26, -33.33, 19.98, -48.77, -100.3, 0.0]\n  Target bbox: [597.92, 284.13, 633.95, 352.96]\n\nFrame 2:\n  Drone pose: [93.26, -33.72, 19.87, -45.99, -102.23, 0.0]\n  Target bbox: [629.17, 325.02, 650.66, 393.87]\n\nFrame 3:\n  Drone pose: [93.14, -34.17, 20.1, -48.0, -103.03, 0.0]\n  Target bbox: [633.98, 295.29, 674.58, 366.41]\n\nFrame 4:\n  Drone pose: [93.25, -34.89, 20.14, -46.66, -102.32, 0.0]\n  Target bbox: [620.87, 324.95, 658.9, 394.04]\n\nFrame 5 (current):\n  Drone pose: [93.11, -35.35, 20.06, -46.52, -101.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.97, \"ymin\": 326.32, \"xmax\": 654.84, \"ymax\": 392.66}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": -0.53, \"dz\": -0.06, \"dpitch\": 0.07, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.04, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.54, \"dz\": -0.06, \"dpitch\": 0.04, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -2.05, \"dz\": -0.06, \"dpitch\": 0.03, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -2.56, \"dz\": -0.06, \"dpitch\": 0.02, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.92, "window_alt_abs_m": 0.46, "target_px_mean_hist": 578.5, "cur_frame_id": 115, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776362463", "difficulty_score": 0.5058, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [46.39, 11.94, 22.0, -46.47, 180.0, 0.0]\n  Target bbox: [627.77, 332.45, 652.23, 386.63]\n\nFrame 2:\n  Drone pose: [44.07, 9.67, 21.2, -47.88, 172.88, 0.0]\n  Target bbox: [625.34, 325.41, 654.36, 393.76]\n\nFrame 3:\n  Drone pose: [42.47, 8.26, 20.67, -48.48, 167.82, 0.0]\n  Target bbox: [621.99, 322.9, 657.66, 396.2]\n\nFrame 4:\n  Drone pose: [41.33, 7.43, 20.64, -49.13, 164.65, 0.0]\n  Target bbox: [623.76, 326.24, 656.03, 392.78]\n\nFrame 5 (current):\n  Drone pose: [40.46, 6.99, 20.62, -49.48, 162.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.86, \"ymin\": 327.31, \"xmax\": 656.02, \"ymax\": 391.56}, \"waypoint_deltas\": [{\"dx\": -0.71, \"dy\": -0.21, \"dz\": -0.03, \"dpitch\": -0.2, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -1.33, \"dy\": -0.27, \"dz\": -0.05, \"dpitch\": -0.33, \"dyaw\": -1.23, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -0.26, \"dz\": -0.07, \"dpitch\": -0.43, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": -0.21, \"dz\": -0.09, \"dpitch\": -0.51, \"dyaw\": -1.17, \"droll\": 0.0}, {\"dx\": -3.0, \"dy\": -0.14, \"dz\": -0.2, \"dpitch\": -0.45, \"dyaw\": -0.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.14, "window_alt_abs_m": 1.38, "target_px_mean_hist": 587.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [34.99, 7.15, 20.27, -49.83, 162.92, 0.0]\n  Target bbox: [621.11, 321.08, 658.53, 397.97]\n\nFrame 2:\n  Drone pose: [34.52, 7.18, 20.24, -49.75, 163.06, 0.0]\n  Target bbox: [623.13, 325.84, 656.73, 392.94]\n\nFrame 3:\n  Drone pose: [34.05, 7.2, 20.22, -49.67, 163.15, 0.0]\n  Target bbox: [624.06, 327.09, 655.82, 391.74]\n\nFrame 4:\n  Drone pose: [33.58, 7.21, 20.19, -49.59, 163.22, 0.0]\n  Target bbox: [622.88, 324.35, 656.89, 394.6]\n\nFrame 5 (current):\n  Drone pose: [33.11, 7.22, 20.17, -49.52, 163.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.63, \"ymin\": 320.25, \"xmax\": 659.99, \"ymax\": 398.77}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": -0.01, \"dz\": -0.02, \"dpitch\": 0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.01, \"dz\": -0.03, \"dpitch\": 0.1, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 0.14, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": -0.03, \"dz\": -0.07, \"dpitch\": 0.16, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": -2.47, \"dy\": -0.04, \"dz\": -0.08, \"dpitch\": 0.19, \"dyaw\": -0.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.33, "window_alt_abs_m": 0.1, "target_px_mean_hist": 667.0, "cur_frame_id": 18, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [27.59, 6.96, 20.04, -49.21, 162.37, 0.0]\n  Target bbox: [621.94, 322.41, 657.78, 396.57]\n\nFrame 2:\n  Drone pose: [27.08, 6.89, 20.03, -49.18, 162.13, 0.0]\n  Target bbox: [621.22, 324.01, 658.59, 394.83]\n\nFrame 3:\n  Drone pose: [26.57, 6.81, 20.03, -49.16, 161.85, 0.0]\n  Target bbox: [623.67, 326.65, 656.21, 392.17]\n\nFrame 4:\n  Drone pose: [26.05, 6.71, 20.02, -49.12, 161.53, 0.0]\n  Target bbox: [620.72, 320.74, 658.91, 398.35]\n\nFrame 5 (current):\n  Drone pose: [25.54, 6.61, 20.02, -49.09, 161.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.39, \"ymin\": 326.1, \"xmax\": 656.5, \"ymax\": 392.68}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -0.24, \"dz\": -0.01, \"dpitch\": 0.08, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": -0.36, \"dz\": -0.01, \"dpitch\": 0.11, \"dyaw\": -1.26, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": -0.49, \"dz\": -0.01, \"dpitch\": 0.14, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": -2.63, \"dy\": -0.61, \"dz\": -0.01, \"dpitch\": 0.16, \"dyaw\": -2.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.2, "window_alt_abs_m": 0.02, "target_px_mean_hist": 662.2, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [20.21, 5.64, 20.0, -48.67, 156.06, 0.0]\n  Target bbox: [620.22, 320.92, 659.49, 398.15]\n\nFrame 2:\n  Drone pose: [19.67, 5.62, 20.0, -48.71, 155.93, 0.0]\n  Target bbox: [625.05, 324.92, 655.12, 393.98]\n\nFrame 3:\n  Drone pose: [19.13, 5.61, 20.0, -48.41, 154.29, 0.0]\n  Target bbox: [621.41, 324.2, 658.5, 394.68]\n\nFrame 4:\n  Drone pose: [18.6, 5.6, 20.0, -48.46, 154.23, 0.0]\n  Target bbox: [623.75, 325.1, 656.42, 393.8]\n\nFrame 5 (current):\n  Drone pose: [18.06, 5.61, 20.0, -48.15, 152.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.35, \"ymin\": 324.66, \"xmax\": 657.57, \"ymax\": 394.26}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -1.49, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": -2.85, \"droll\": 0.0}, {\"dx\": -2.71, \"dy\": 0.17, \"dz\": 0.0, \"dpitch\": 0.36, \"dyaw\": -2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 672.8, "cur_frame_id": 47, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [12.21, 6.49, 20.0, -47.32, 147.46, 0.0]\n  Target bbox: [620.82, 322.99, 659.02, 396.15]\n\nFrame 2:\n  Drone pose: [11.71, 6.67, 20.0, -47.46, 147.95, 0.0]\n  Target bbox: [626.12, 325.57, 654.09, 393.42]\n\nFrame 3:\n  Drone pose: [11.22, 6.85, 20.0, -47.18, 147.09, 0.0]\n  Target bbox: [618.92, 321.02, 660.84, 398.21]\n\nFrame 4:\n  Drone pose: [10.73, 7.03, 20.0, -47.32, 147.61, 0.0]\n  Target bbox: [622.78, 325.18, 657.46, 393.85]\n\nFrame 5 (current):\n  Drone pose: [10.24, 7.21, 20.0, -47.03, 146.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.01, \"ymin\": 325.19, \"xmax\": 656.97, \"ymax\": 393.81}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": 0.6, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 0.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.74, "window_alt_abs_m": 0.0, "target_px_mean_hist": 647.0, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [4.99, 8.23, 20.0, -47.82, 147.73, 0.0]\n  Target bbox: [618.46, 320.62, 661.28, 398.61]\n\nFrame 2:\n  Drone pose: [4.52, 8.22, 20.0, -47.78, 147.75, 0.0]\n  Target bbox: [618.64, 321.11, 661.16, 398.06]\n\nFrame 3:\n  Drone pose: [3.98, 8.02, 20.0, -47.65, 147.14, 0.0]\n  Target bbox: [618.54, 321.11, 661.23, 398.12]\n\nFrame 4:\n  Drone pose: [3.36, 7.59, 20.0, -47.43, 145.73, 0.0]\n  Target bbox: [620.92, 323.27, 659.05, 395.73]\n\nFrame 5 (current):\n  Drone pose: [2.65, 6.95, 20.0, -47.11, 143.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.5, \"ymin\": 324.31, \"xmax\": 657.46, \"ymax\": 394.71}, \"waypoint_deltas\": [{\"dx\": -0.76, \"dy\": -0.76, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": -2.44, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 0.88, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": -2.16, \"dz\": 0.0, \"dpitch\": 1.32, \"dyaw\": -6.68, \"droll\": 0.0}, {\"dx\": -2.84, \"dy\": -2.66, \"dz\": 0.0, \"dpitch\": 1.7, \"dyaw\": -8.06, \"droll\": 0.0}, {\"dx\": -3.41, \"dy\": -3.0, \"dz\": 0.0, \"dpitch\": 1.99, \"dyaw\": -8.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.14, "window_alt_abs_m": 0.0, "target_px_mean_hist": 644.2, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.77, 3.49, 20.0, -44.63, 133.71, 0.0]\n  Target bbox: [621.95, 324.24, 658.07, 395.02]\n\nFrame 2:\n  Drone pose: [-4.27, 3.49, 20.0, -44.63, 133.71, 0.0]\n  Target bbox: [623.73, 325.35, 656.33, 393.84]\n\nFrame 3:\n  Drone pose: [-4.77, 3.49, 20.0, -44.62, 133.71, 0.0]\n  Target bbox: [622.38, 324.16, 657.69, 395.05]\n\nFrame 4:\n  Drone pose: [-5.27, 3.49, 20.0, -44.62, 133.71, 0.0]\n  Target bbox: [621.93, 324.28, 658.07, 394.97]\n\nFrame 5 (current):\n  Drone pose: [-5.77, 3.49, 20.0, -44.62, 133.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.35, \"ymin\": 323.02, \"xmax\": 660.61, \"ymax\": 396.37}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 607.2, "cur_frame_id": 91, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00105/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.77, 3.48, 20.0, -44.62, 133.68, 0.0]\n  Target bbox: [620.3, 323.35, 659.69, 395.98]\n\nFrame 2:\n  Drone pose: [-11.28, 3.47, 20.0, -44.62, 133.64, 0.0]\n  Target bbox: [618.91, 322.83, 661.01, 396.58]\n\nFrame 3:\n  Drone pose: [-11.8, 3.46, 20.0, -44.62, 133.58, 0.0]\n  Target bbox: [618.56, 322.43, 661.39, 396.98]\n\nFrame 4:\n  Drone pose: [-12.33, 3.43, 20.0, -44.62, 133.46, 0.0]\n  Target bbox: [618.43, 322.82, 661.49, 396.63]\n\nFrame 5 (current):\n  Drone pose: [-12.8, 3.48, 20.0, -44.64, 133.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.82, \"ymin\": 321.64, \"xmax\": 662.13, \"ymax\": 397.78}, \"waypoint_deltas\": [{\"dx\": -0.44, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 0.23, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": 0.56, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 2.12, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": 0.72, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.39, "window_alt_abs_m": 0.0, "target_px_mean_hist": 579.2, "cur_frame_id": 105, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-16.37, 5.13, 20.0, -44.2, 141.06, 0.0]\n  Target bbox: [624.3, 326.52, 655.72, 392.69]\n\nFrame 2:\n  Drone pose: [-16.45, 5.27, 20.0, -43.85, 142.13, 0.0]\n  Target bbox: [621.9, 324.44, 658.03, 394.87]\n\nFrame 3:\n  Drone pose: [-16.53, 5.4, 20.0, -43.49, 143.17, 0.0]\n  Target bbox: [620.5, 323.83, 659.37, 395.63]\n\nFrame 4:\n  Drone pose: [-16.61, 5.54, 20.0, -43.13, 144.19, 0.0]\n  Target bbox: [626.32, 327.12, 653.92, 392.18]\n\nFrame 5 (current):\n  Drone pose: [-16.7, 5.68, 20.0, -42.37, 144.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.82, \"ymin\": 323.3, \"xmax\": 660.05, \"ymax\": 396.23}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": 0.28, \"dz\": 0.0, \"dpitch\": 0.41, \"dyaw\": -1.84, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 0.42, \"dz\": 0.0, \"dpitch\": 0.62, \"dyaw\": -2.74, \"droll\": 0.0}, {\"dx\": -0.33, \"dy\": 0.56, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": -3.63, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": 0.7, \"dz\": 0.0, \"dpitch\": 1.06, \"dyaw\": -4.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.26, "window_alt_abs_m": 0.0, "target_px_mean_hist": 602.0, "cur_frame_id": 120, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00131/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/ORI/frames_playback/frame_00134/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.1, 7.03, 20.0, -39.73, 136.04, 0.0]\n  Target bbox: [622.64, 326.73, 657.31, 392.9]\n\nFrame 2:\n  Drone pose: [-17.08, 7.16, 20.0, -39.4, 135.41, 0.0]\n  Target bbox: [627.68, 330.54, 652.48, 388.87]\n\nFrame 3:\n  Drone pose: [-17.06, 7.29, 20.0, -39.48, 133.93, 0.0]\n  Target bbox: [620.71, 325.74, 659.2, 393.99] (model-predicted box)\n\nFrame 4:\n  Drone pose: [-17.04, 7.41, 20.0, -40.0, 133.28, 0.0]\n  Target bbox: [625.87, 328.41, 654.18, 391.06]\n\nFrame 5 (current):\n  Drone pose: [-17.02, 7.54, 20.0, -40.53, 132.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.51, \"ymin\": 327.39, \"xmax\": 656.5, \"ymax\": 392.18}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -1.08, \"dyaw\": -1.42, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 0.39, \"dz\": 0.0, \"dpitch\": -1.63, \"dyaw\": -2.17, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -2.18, \"dyaw\": -2.94, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 0.65, \"dz\": 0.0, \"dpitch\": -2.75, \"dyaw\": -3.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 1, "current_invisible": false, "window_yaw_abs_deg": 3.42, "window_alt_abs_m": 0.0, "target_px_mean_hist": 188.0, "cur_frame_id": 134, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [46.34, 11.8, 21.88, -46.37, 179.6, 0.0]\n  Target bbox: [627.65, 332.21, 652.23, 386.87]\n\nFrame 2:\n  Drone pose: [44.09, 9.7, 21.29, -47.96, 172.97, 0.0]\n  Target bbox: [625.39, 328.71, 654.41, 390.33]\n\nFrame 3:\n  Drone pose: [42.48, 8.28, 20.69, -49.96, 162.91, 0.0]\n  Target bbox: [677.24, 300.28, 714.51, 373.55]\n\nFrame 4:\n  Drone pose: [41.33, 7.54, 20.73, -49.31, 165.0, 0.0]\n  Target bbox: [623.98, 326.85, 655.87, 392.01]\n\nFrame 5 (current):\n  Drone pose: [40.46, 6.99, 20.62, -53.17, 159.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 663.1, \"ymin\": 265.87, \"xmax\": 697.58, \"ymax\": 331.07}, \"waypoint_deltas\": [{\"dx\": -0.71, \"dy\": -0.21, \"dz\": -0.03, \"dpitch\": 3.49, \"dyaw\": 2.77, \"droll\": 0.0}, {\"dx\": -1.33, \"dy\": -0.27, \"dz\": -0.05, \"dpitch\": 3.36, \"dyaw\": 2.43, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -0.26, \"dz\": -0.07, \"dpitch\": 3.26, \"dyaw\": 2.38, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": -0.21, \"dz\": -0.09, \"dpitch\": 3.18, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": -3.0, \"dy\": -0.14, \"dz\": -0.2, \"dpitch\": 3.24, \"dyaw\": 2.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.59, "window_alt_abs_m": 1.34, "target_px_mean_hist": 582.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.11, 7.14, 20.13, -49.34, 158.01, 0.0]\n  Target bbox: [673.69, 323.72, 716.21, 401.56]\n\nFrame 2:\n  Drone pose: [34.52, 7.04, 20.24, -48.32, 167.57, 0.0]\n  Target bbox: [567.54, 349.47, 602.77, 418.47]\n\nFrame 3:\n  Drone pose: [34.17, 7.13, 20.13, -52.51, 168.02, 0.0]\n  Target bbox: [566.96, 273.55, 602.35, 340.94]\n\nFrame 4:\n  Drone pose: [33.58, 7.21, 20.19, -49.34, 158.22, 0.0]\n  Target bbox: [674.61, 327.79, 714.86, 403.16]\n\nFrame 5 (current):\n  Drone pose: [32.95, 7.18, 20.28, -49.91, 162.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.21, \"ymin\": 326.16, \"xmax\": 656.64, \"ymax\": 392.64}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": 0.03, \"dz\": -0.13, \"dpitch\": 0.45, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": 0.03, \"dz\": -0.14, \"dpitch\": 0.49, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -1.31, \"dy\": 0.02, \"dz\": -0.16, \"dpitch\": 0.53, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": -1.81, \"dy\": 0.01, \"dz\": -0.18, \"dpitch\": 0.55, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": -2.31, \"dy\": 0.0, \"dz\": -0.19, \"dpitch\": 0.58, \"dyaw\": 0.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.58, "window_alt_abs_m": 0.37, "target_px_mean_hist": 650.5, "cur_frame_id": 18, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [27.59, 6.96, 20.04, -46.81, 164.81, 0.0]\n  Target bbox: [595.78, 366.72, 629.72, 433.48]\n\nFrame 2:\n  Drone pose: [27.1, 6.98, 20.14, -47.54, 162.74, 0.0]\n  Target bbox: [619.77, 356.91, 653.68, 423.33]\n\nFrame 3:\n  Drone pose: [26.45, 6.81, 19.95, -49.07, 163.76, 0.0]\n  Target bbox: [600.8, 330.13, 633.79, 395.0]\n\nFrame 4:\n  Drone pose: [26.05, 6.71, 20.02, -47.47, 158.65, 0.0]\n  Target bbox: [653.09, 350.52, 690.8, 424.96]\n\nFrame 5 (current):\n  Drone pose: [25.46, 6.43, 20.04, -49.14, 160.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.79, \"ymin\": 323.01, \"xmax\": 657.96, \"ymax\": 395.95}, \"waypoint_deltas\": [{\"dx\": -0.44, \"dy\": 0.06, \"dz\": -0.02, \"dpitch\": 0.09, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": -0.06, \"dz\": -0.03, \"dpitch\": 0.13, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": -0.18, \"dz\": -0.03, \"dpitch\": 0.16, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -0.31, \"dz\": -0.03, \"dpitch\": 0.19, \"dyaw\": -1.01, \"droll\": 0.0}, {\"dx\": -2.55, \"dy\": -0.43, \"dz\": -0.03, \"dpitch\": 0.21, \"dyaw\": -1.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.03, "window_alt_abs_m": 0.4, "target_px_mean_hist": 670.0, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [20.21, 5.64, 20.0, -44.62, 151.06, 0.0]\n  Target bbox: [677.84, 394.51, 714.03, 463.82]\n\nFrame 2:\n  Drone pose: [19.67, 5.64, 20.07, -49.68, 151.0, 0.0]\n  Target bbox: [675.73, 310.86, 715.98, 383.35]\n\nFrame 3:\n  Drone pose: [19.13, 5.61, 20.0, -48.41, 154.29, 0.0]\n  Target bbox: [617.94, 319.6, 661.74, 399.53]\n\nFrame 4:\n  Drone pose: [18.5, 5.57, 20.15, -51.72, 150.43, 0.0]\n  Target bbox: [662.04, 276.75, 697.32, 346.1]\n\nFrame 5 (current):\n  Drone pose: [18.06, 5.61, 20.0, -48.15, 152.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.64, \"ymin\": 320.51, \"xmax\": 661.1, \"ymax\": 398.63}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -1.49, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": -2.85, \"droll\": 0.0}, {\"dx\": -2.71, \"dy\": 0.17, \"dz\": 0.0, \"dpitch\": 0.36, \"dyaw\": -2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.46, "window_alt_abs_m": 0.44, "target_px_mean_hist": 667.0, "cur_frame_id": 47, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [12.35, 6.46, 19.93, -47.0, 147.62, 0.0]\n  Target bbox: [620.71, 322.75, 659.18, 396.32]\n\nFrame 2:\n  Drone pose: [11.83, 6.78, 20.03, -47.71, 143.46, 0.0]\n  Target bbox: [679.51, 322.39, 715.08, 391.41]\n\nFrame 3:\n  Drone pose: [11.22, 6.85, 20.0, -49.64, 147.12, 0.0]\n  Target bbox: [620.56, 281.89, 658.7, 354.54]\n\nFrame 4:\n  Drone pose: [10.75, 7.05, 20.17, -47.56, 147.69, 0.0]\n  Target bbox: [626.06, 327.38, 654.09, 391.54]\n\nFrame 5 (current):\n  Drone pose: [10.24, 7.21, 20.0, -49.19, 143.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 663.65, \"ymin\": 289.91, \"xmax\": 698.25, \"ymax\": 358.35}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": 2.03, \"dyaw\": 3.99, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": 2.34, \"dyaw\": 3.04, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 2.23, \"dyaw\": 3.35, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 2.11, \"dyaw\": 3.58, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": 0.6, \"dz\": 0.0, \"dpitch\": 2.0, \"dyaw\": 3.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.88, "window_alt_abs_m": 0.47, "target_px_mean_hist": 654.2, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [4.98, 8.29, 20.13, -47.47, 152.89, 0.0]\n  Target bbox: [563.38, 335.74, 603.45, 407.91]\n\nFrame 2:\n  Drone pose: [4.51, 8.28, 19.88, -47.67, 147.91, 0.0]\n  Target bbox: [619.24, 320.88, 660.54, 398.25]\n\nFrame 3:\n  Drone pose: [3.98, 8.02, 20.0, -42.8, 142.14, 0.0]\n  Target bbox: [675.82, 405.23, 718.41, 480.71]\n\nFrame 4:\n  Drone pose: [3.5, 7.56, 19.88, -45.66, 150.92, 0.0]\n  Target bbox: [563.7, 349.81, 600.99, 419.01]\n\nFrame 5 (current):\n  Drone pose: [2.65, 6.95, 20.0, -47.11, 143.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.1, \"ymin\": 324.92, \"xmax\": 656.88, \"ymax\": 394.08}, \"waypoint_deltas\": [{\"dx\": -0.76, \"dy\": -0.76, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": -2.44, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 0.88, \"dyaw\": -4.77, \"droll\": 0.0}, {\"dx\": -2.21, \"dy\": -2.16, \"dz\": 0.0, \"dpitch\": 1.32, \"dyaw\": -6.68, \"droll\": 0.0}, {\"dx\": -2.84, \"dy\": -2.66, \"dz\": 0.0, \"dpitch\": 1.7, \"dyaw\": -8.06, \"droll\": 0.0}, {\"dx\": -3.41, \"dy\": -3.0, \"dz\": 0.0, \"dpitch\": 1.99, \"dyaw\": -8.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 26.83, "window_alt_abs_m": 0.61, "target_px_mean_hist": 640.8, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-3.77, 3.49, 20.0, -39.63, 136.61, 0.0]\n  Target bbox: [583.07, 406.86, 626.33, 482.03]\n\nFrame 2:\n  Drone pose: [-4.27, 3.49, 20.0, -47.88, 137.71, 0.0]\n  Target bbox: [575.25, 272.65, 608.35, 339.69]\n\nFrame 3:\n  Drone pose: [-4.87, 3.42, 20.02, -46.09, 128.33, 0.0]\n  Target bbox: [680.11, 302.57, 720.09, 373.23]\n\nFrame 4:\n  Drone pose: [-5.31, 3.38, 20.03, -43.85, 138.41, 0.0]\n  Target bbox: [562.37, 340.18, 597.44, 407.67]\n\nFrame 5 (current):\n  Drone pose: [-5.79, 3.58, 19.99, -47.68, 134.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.3, \"ymin\": 275.83, \"xmax\": 653.86, \"ymax\": 344.52}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.09, \"dz\": 0.01, \"dpitch\": 3.06, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.09, \"dz\": 0.01, \"dpitch\": 3.06, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -1.48, \"dy\": -0.09, \"dz\": 0.01, \"dpitch\": 3.06, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -0.09, \"dz\": 0.01, \"dpitch\": 3.06, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": -0.09, \"dz\": 0.01, \"dpitch\": 3.06, \"dyaw\": -0.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.84, "window_alt_abs_m": 0.06, "target_px_mean_hist": 581.5, "cur_frame_id": 91, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00105/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.94, 3.49, 19.93, -44.69, 133.35, 0.0]\n  Target bbox: [622.87, 324.41, 657.19, 394.78]\n\nFrame 2:\n  Drone pose: [-11.19, 3.46, 19.85, -44.29, 133.81, 0.0]\n  Target bbox: [623.86, 325.05, 656.2, 394.12]\n\nFrame 3:\n  Drone pose: [-11.8, 3.46, 20.0, -45.56, 132.33, 0.0]\n  Target bbox: [635.05, 307.19, 675.53, 381.0]\n\nFrame 4:\n  Drone pose: [-12.23, 3.28, 20.04, -45.43, 136.2, 0.0]\n  Target bbox: [583.93, 305.89, 627.22, 380.79]\n\nFrame 5 (current):\n  Drone pose: [-12.73, 3.36, 19.94, -47.68, 138.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 556.92, \"ymin\": 268.25, \"xmax\": 601.81, \"ymax\": 343.43}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.19, \"dz\": 0.06, \"dpitch\": 3.01, \"dyaw\": -4.64, \"droll\": 0.0}, {\"dx\": -0.88, \"dy\": 0.35, \"dz\": 0.06, \"dpitch\": 2.97, \"dyaw\": -4.03, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 0.52, \"dz\": 0.06, \"dpitch\": 2.94, \"dyaw\": -3.41, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": 0.68, \"dz\": 0.06, \"dpitch\": 2.91, \"dyaw\": -2.79, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 0.84, \"dz\": 0.06, \"dpitch\": 2.88, \"dyaw\": -2.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.15, "window_alt_abs_m": 0.36, "target_px_mean_hist": 633.8, "cur_frame_id": 105, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-16.23, 5.09, 19.96, -46.09, 136.22, 0.0]\n  Target bbox: [683.46, 291.6, 718.21, 359.68]\n\nFrame 2:\n  Drone pose: [-16.41, 5.15, 19.9, -43.55, 141.93, 0.0]\n  Target bbox: [618.49, 321.9, 661.35, 397.6]\n\nFrame 3:\n  Drone pose: [-16.59, 5.35, 19.89, -41.2, 143.32, 0.0]\n  Target bbox: [619.76, 362.98, 651.03, 428.46]\n\nFrame 4:\n  Drone pose: [-16.61, 5.54, 20.0, -43.13, 144.19, 0.0]\n  Target bbox: [623.76, 326.12, 656.54, 393.23]\n\nFrame 5 (current):\n  Drone pose: [-16.77, 5.59, 19.92, -39.53, 142.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 639.74, \"ymin\": 369.88, \"xmax\": 677.46, \"ymax\": 441.31}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.23, \"dz\": 0.08, \"dpitch\": -2.64, \"dyaw\": 0.85, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 0.37, \"dz\": 0.08, \"dpitch\": -2.43, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": 0.51, \"dz\": 0.08, \"dpitch\": -2.22, \"dyaw\": -0.96, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": 0.65, \"dz\": 0.08, \"dpitch\": -2.0, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": 0.79, \"dz\": 0.08, \"dpitch\": -1.78, \"dyaw\": -2.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.88, "window_alt_abs_m": 0.25, "target_px_mean_hist": 602.8, "cur_frame_id": 120, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00131/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029/aug_001/frames_playback/frame_00134/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.01, 6.99, 20.08, -38.48, 135.92, 0.0]\n  Target bbox: [627.21, 348.57, 657.88, 413.3]\n\nFrame 2:\n  Drone pose: [-17.08, 7.16, 20.0, -43.6, 138.29, 0.0]\n  Target bbox: [591.78, 260.68, 613.38, 318.52]\n\nFrame 3:\n  Drone pose: [-16.96, 7.33, 19.91, -40.33, 135.41, 0.0]\n  Target bbox: [604.26, 308.54, 643.19, 376.89]\n\nFrame 4:\n  Drone pose: [-17.04, 7.41, 20.0, -43.04, 132.16, 0.0]\n  Target bbox: [635.8, 274.7, 673.68, 343.0]\n\nFrame 5 (current):\n  Drone pose: [-17.05, 7.51, 20.05, -40.61, 132.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.64, \"ymin\": 326.2, \"xmax\": 657.38, \"ymax\": 393.36}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": 0.16, \"dz\": -0.05, \"dpitch\": -0.46, \"dyaw\": -0.58, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 0.29, \"dz\": -0.05, \"dpitch\": -1.0, \"dyaw\": -1.3, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 0.42, \"dz\": -0.05, \"dpitch\": -1.55, \"dyaw\": -2.05, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": 0.55, \"dz\": -0.05, \"dpitch\": -2.1, \"dyaw\": -2.82, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": 0.68, \"dz\": -0.05, \"dpitch\": -2.67, \"dyaw\": -3.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.83, "window_alt_abs_m": 0.32, "target_px_mean_hist": 196.0, "cur_frame_id": 134, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776319029", "difficulty_score": 0.529, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.36, 85.72, 22.0, -46.4, -98.53, 0.0]\n  Target bbox: [621.9, 327.37, 658.06, 391.95]\n\nFrame 2:\n  Drone pose: [104.71, 84.0, 21.2, -47.3, -95.61, 0.0]\n  Target bbox: [625.58, 327.34, 654.16, 391.81]\n\nFrame 3:\n  Drone pose: [103.77, 82.84, 20.67, -47.67, -92.8, 0.0]\n  Target bbox: [616.41, 320.75, 663.62, 398.55]\n\nFrame 4:\n  Drone pose: [103.22, 82.04, 20.64, -48.13, -92.71, 0.0]\n  Target bbox: [618.94, 321.1, 661.2, 397.98]\n\nFrame 5 (current):\n  Drone pose: [102.91, 81.39, 20.62, -48.31, -93.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.53, \"ymin\": 325.73, \"xmax\": 661.09, \"ymax\": 393.2}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.54, \"dz\": -0.03, \"dpitch\": -0.05, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": -0.35, \"dy\": -1.05, \"dz\": -0.05, \"dpitch\": -0.04, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -1.56, \"dz\": -0.07, \"dpitch\": -0.03, \"dyaw\": 1.53, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -2.06, \"dz\": -0.09, \"dpitch\": -0.01, \"dyaw\": 1.86, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -2.57, \"dz\": -0.2, \"dpitch\": 0.14, \"dyaw\": 2.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.47, "window_alt_abs_m": 1.38, "target_px_mean_hist": 543.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.91, 75.79, 20.24, -47.99, -90.05, 0.0]\n  Target bbox: [619.93, 325.8, 660.05, 393.06]\n\nFrame 2:\n  Drone pose: [101.89, 75.28, 20.22, -47.96, -90.01, 0.0]\n  Target bbox: [620.31, 323.99, 659.68, 394.87]\n\nFrame 3:\n  Drone pose: [101.9, 74.76, 20.19, -47.94, -90.03, 0.0]\n  Target bbox: [618.72, 326.0, 661.27, 392.89]\n\nFrame 4:\n  Drone pose: [101.93, 74.25, 20.17, -47.94, -90.12, 0.0]\n  Target bbox: [616.52, 325.84, 663.42, 393.1]\n\nFrame 5 (current):\n  Drone pose: [101.98, 73.73, 20.15, -47.94, -90.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.4, \"ymin\": 325.41, \"xmax\": 660.46, \"ymax\": 393.48}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": -0.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -1.05, \"dz\": -0.03, \"dpitch\": -0.03, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": -1.58, \"dz\": -0.05, \"dpitch\": -0.06, \"dyaw\": -0.9, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -2.12, \"dz\": -0.06, \"dpitch\": -0.1, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 0.54, \"dy\": -2.66, \"dz\": -0.07, \"dpitch\": -0.14, \"dyaw\": -1.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.31, "window_alt_abs_m": 0.09, "target_px_mean_hist": 623.2, "cur_frame_id": 19, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.1, 67.69, 20.03, -48.59, -94.13, 0.0]\n  Target bbox: [616.04, 317.69, 664.14, 401.33]\n\nFrame 2:\n  Drone pose: [103.06, 67.09, 20.03, -48.68, -95.73, 0.0]\n  Target bbox: [615.15, 316.89, 665.03, 402.14]\n\nFrame 3:\n  Drone pose: [102.99, 66.48, 20.02, -48.78, -97.23, 0.0]\n  Target bbox: [619.5, 323.05, 660.44, 395.96]\n\nFrame 4:\n  Drone pose: [102.91, 65.86, 20.02, -48.88, -98.71, 0.0]\n  Target bbox: [617.55, 318.39, 662.69, 400.6]\n\nFrame 5 (current):\n  Drone pose: [102.83, 65.24, 20.02, -48.97, -100.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.28, \"ymin\": 319.53, \"xmax\": 662.8, \"ymax\": 399.39}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.63, \"dz\": -0.01, \"dpitch\": -0.06, \"dyaw\": -1.55, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.24, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -3.15, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -1.86, \"dz\": -0.01, \"dpitch\": -0.29, \"dyaw\": -3.07, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -2.47, \"dz\": -0.01, \"dpitch\": -0.5, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -3.08, \"dz\": -0.01, \"dpitch\": -0.72, \"dyaw\": -2.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.08, "window_alt_abs_m": 0.02, "target_px_mean_hist": 649.2, "cur_frame_id": 34, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.03, 57.88, 20.0, -49.97, -110.7, 0.0]\n  Target bbox: [615.16, 317.97, 665.16, 401.05]\n\nFrame 2:\n  Drone pose: [100.72, 57.25, 20.0, -50.06, -111.48, 0.0]\n  Target bbox: [614.7, 318.81, 665.54, 400.08]\n\nFrame 3:\n  Drone pose: [100.39, 56.65, 20.0, -49.89, -112.17, 0.0]\n  Target bbox: [615.19, 315.84, 665.22, 403.09]\n\nFrame 4:\n  Drone pose: [100.04, 56.07, 20.0, -49.92, -112.81, 0.0]\n  Target bbox: [618.19, 322.11, 662.0, 396.79]\n\nFrame 5 (current):\n  Drone pose: [99.7, 55.48, 20.0, -49.96, -113.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.8, \"ymin\": 318.41, \"xmax\": 665.52, \"ymax\": 400.52}, \"waypoint_deltas\": [{\"dx\": -0.34, \"dy\": -0.55, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.59, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -1.6, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": -1.61, \"droll\": 0.0}, {\"dx\": -1.44, \"dy\": -2.11, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -1.99, \"droll\": 0.0}, {\"dx\": -1.85, \"dy\": -2.6, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.77, "window_alt_abs_m": 0.0, "target_px_mean_hist": 613.8, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.03, 49.91, 20.0, -49.82, -114.65, 0.0]\n  Target bbox: [620.87, 318.83, 659.48, 400.0]\n\nFrame 2:\n  Drone pose: [94.58, 49.45, 20.0, -49.73, -114.77, 0.0]\n  Target bbox: [615.02, 317.49, 665.39, 401.52]\n\nFrame 3:\n  Drone pose: [94.21, 49.03, 20.0, -49.5, -115.07, 0.0]\n  Target bbox: [613.97, 319.0, 665.85, 399.93]\n\nFrame 4:\n  Drone pose: [93.92, 48.65, 20.0, -49.53, -113.93, 0.0]\n  Target bbox: [620.42, 320.39, 659.33, 398.41]\n\nFrame 5 (current):\n  Drone pose: [93.68, 48.27, 20.0, -49.49, -112.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.9, \"ymin\": 321.38, \"xmax\": 661.87, \"ymax\": 397.59}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": -0.74, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.37, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -1.12, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 1.86, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": -1.51, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 2.25, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": -1.92, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": 2.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.49, "window_alt_abs_m": 0.0, "target_px_mean_hist": 652.0, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.37, 43.02, 20.0, -49.3, -107.71, 0.0]\n  Target bbox: [611.31, 319.64, 668.39, 399.33]\n\nFrame 2:\n  Drone pose: [92.21, 42.52, 20.0, -49.37, -107.18, 0.0]\n  Target bbox: [622.22, 321.74, 657.51, 397.09]\n\nFrame 3:\n  Drone pose: [92.06, 42.05, 20.0, -49.4, -106.65, 0.0]\n  Target bbox: [611.14, 320.04, 668.54, 398.95]\n\nFrame 4:\n  Drone pose: [91.92, 41.59, 20.0, -49.4, -106.16, 0.0]\n  Target bbox: [617.31, 321.75, 662.39, 397.17]\n\nFrame 5 (current):\n  Drone pose: [91.73, 41.22, 20.0, -49.28, -105.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.12, \"ymin\": 320.82, \"xmax\": 660.56, \"ymax\": 398.04}, \"waypoint_deltas\": [{\"dx\": -0.11, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 0.49, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -0.76, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -1.17, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.62, \"dz\": 0.0, \"dpitch\": 0.6, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -2.73, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -3.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.33, "window_alt_abs_m": 0.0, "target_px_mean_hist": 653.5, "cur_frame_id": 81, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.22, 25.8, 20.0, -43.73, -162.73, 0.0]\n  Target bbox: [619.16, 318.59, 661.34, 400.92]\n\nFrame 2:\n  Drone pose: [106.61, 24.73, 20.0, -43.42, -164.63, 0.0]\n  Target bbox: [621.83, 320.81, 658.6, 398.69]\n\nFrame 3:\n  Drone pose: [106.97, 23.68, 20.0, -43.34, -166.4, 0.0]\n  Target bbox: [622.48, 324.1, 657.85, 395.3]\n\nFrame 4:\n  Drone pose: [107.01, 22.8, 20.0, -43.4, -167.48, 0.0]\n  Target bbox: [616.64, 319.47, 663.11, 400.06]\n\nFrame 5 (current):\n  Drone pose: [106.81, 22.06, 20.0, -43.05, -168.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.16, \"ymin\": 323.99, \"xmax\": 660.77, \"ymax\": 395.51}, \"waypoint_deltas\": [{\"dx\": -0.22, \"dy\": -0.73, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": -0.78, \"droll\": 0.0}, {\"dx\": -0.73, \"dy\": -1.31, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": -1.87, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": -1.77, \"dy\": -2.41, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": -2.28, \"dy\": -2.93, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -1.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.0, "cur_frame_id": 96, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.51, 16.09, 20.0, -42.92, -169.74, 0.0]\n  Target bbox: [621.11, 322.76, 658.69, 396.65]\n\nFrame 2:\n  Drone pose: [100.99, 15.7, 20.0, -42.91, -169.41, 0.0]\n  Target bbox: [617.28, 321.94, 662.69, 397.56]\n\nFrame 3:\n  Drone pose: [100.49, 15.37, 20.0, -42.87, -168.96, 0.0]\n  Target bbox: [622.44, 321.27, 658.05, 398.38]\n\nFrame 4:\n  Drone pose: [99.99, 15.07, 20.0, -42.94, -169.79, 0.0]\n  Target bbox: [619.96, 323.04, 659.98, 396.33]\n\nFrame 5 (current):\n  Drone pose: [99.5, 14.79, 20.0, -42.88, -169.18, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.3, \"ymin\": 323.57, \"xmax\": 657.08, \"ymax\": 395.88}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -1.51, \"dy\": -0.79, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": -1.25, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 546.2, "cur_frame_id": 111, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.78, 11.9, 20.0, -42.36, -170.38, 0.0]\n  Target bbox: [616.64, 321.24, 663.27, 398.36]\n\nFrame 2:\n  Drone pose: [93.32, 11.65, 20.0, -42.24, -169.73, 0.0]\n  Target bbox: [621.97, 320.12, 658.57, 399.57]\n\nFrame 3:\n  Drone pose: [92.88, 11.39, 20.0, -42.22, -170.45, 0.0]\n  Target bbox: [619.45, 324.22, 660.51, 395.32]\n\nFrame 4:\n  Drone pose: [92.42, 11.15, 20.0, -42.1, -169.77, 0.0]\n  Target bbox: [624.15, 327.06, 656.11, 392.32]\n\nFrame 5 (current):\n  Drone pose: [91.95, 10.93, 20.0, -42.12, -170.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.53, \"ymin\": 321.46, \"xmax\": 660.25, \"ymax\": 398.03}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.47, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -1.06, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 553.2, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/ORI/frames_playback/frame_00142/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [86.35, 8.51, 20.0, -42.23, -170.11, 0.0]\n  Target bbox: [620.54, 323.41, 659.39, 396.0]\n\nFrame 2:\n  Drone pose: [85.86, 8.33, 20.0, -42.15, -169.24, 0.0]\n  Target bbox: [623.54, 325.0, 656.79, 394.45]\n\nFrame 3:\n  Drone pose: [85.37, 8.16, 20.0, -42.18, -169.72, 0.0]\n  Target bbox: [624.04, 327.52, 656.19, 391.8]\n\nFrame 4:\n  Drone pose: [84.88, 7.98, 20.0, -42.21, -170.22, 0.0]\n  Target bbox: [615.67, 320.28, 664.15, 399.42]\n\nFrame 5 (current):\n  Drone pose: [84.38, 7.78, 20.0, -42.13, -169.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.17, \"ymin\": 320.32, \"xmax\": 658.36, \"ymax\": 399.37}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.42, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": -0.86, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 557.0, "cur_frame_id": 142, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.41, 85.87, 21.88, -48.7, -100.95, 0.0]\n  Target bbox: [649.43, 281.08, 686.18, 349.31]\n\nFrame 2:\n  Drone pose: [104.77, 83.87, 21.09, -43.46, -98.61, 0.0]\n  Target bbox: [650.67, 389.52, 693.39, 460.91]\n\nFrame 3:\n  Drone pose: [103.77, 82.84, 20.67, -47.67, -92.8, 0.0]\n  Target bbox: [618.59, 322.79, 661.36, 396.46]\n\nFrame 4:\n  Drone pose: [103.23, 82.05, 20.63, -48.08, -92.73, 0.0]\n  Target bbox: [620.21, 323.71, 659.73, 395.43]\n\nFrame 5 (current):\n  Drone pose: [102.95, 81.47, 20.55, -47.8, -88.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.47, \"ymin\": 332.53, \"xmax\": 603.14, \"ymax\": 399.46}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": -0.62, \"dz\": 0.04, \"dpitch\": -0.56, \"dyaw\": -4.22, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": -1.13, \"dz\": 0.02, \"dpitch\": -0.55, \"dyaw\": -3.74, \"droll\": 0.0}, {\"dx\": -0.51, \"dy\": -1.64, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": -3.35, \"droll\": 0.0}, {\"dx\": -0.61, \"dy\": -2.14, \"dz\": -0.02, \"dpitch\": -0.52, \"dyaw\": -3.02, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": -2.65, \"dz\": -0.13, \"dpitch\": -0.37, \"dyaw\": -2.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.49, "window_alt_abs_m": 1.33, "target_px_mean_hist": 552.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.91, 75.79, 20.24, -47.99, -90.05, 0.0]\n  Target bbox: [624.64, 324.82, 655.33, 394.03]\n\nFrame 2:\n  Drone pose: [101.89, 75.28, 20.22, -45.03, -93.02, 0.0]\n  Target bbox: [658.94, 375.49, 689.26, 443.27]\n\nFrame 3:\n  Drone pose: [101.85, 74.83, 20.24, -52.9, -91.21, 0.0]\n  Target bbox: [640.17, 242.07, 670.9, 308.99]\n\nFrame 4:\n  Drone pose: [102.1, 74.29, 20.13, -49.42, -95.69, 0.0]\n  Target bbox: [675.54, 299.77, 717.83, 368.95]\n\nFrame 5 (current):\n  Drone pose: [101.86, 73.58, 20.13, -48.15, -89.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.7, \"ymin\": 325.74, \"xmax\": 661.35, \"ymax\": 393.1}, \"waypoint_deltas\": [{\"dx\": 0.18, \"dy\": -0.37, \"dz\": 0.01, \"dpitch\": 0.2, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": -0.9, \"dz\": -0.01, \"dpitch\": 0.18, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": -1.43, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -1.97, \"dz\": -0.04, \"dpitch\": 0.11, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": -2.51, \"dz\": -0.05, \"dpitch\": 0.07, \"dyaw\": -2.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.06, "window_alt_abs_m": 0.16, "target_px_mean_hist": 629.2, "cur_frame_id": 19, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.1, 67.69, 20.03, -46.04, -99.13, 0.0]\n  Target bbox: [670.75, 363.06, 721.68, 445.17]\n\nFrame 2:\n  Drone pose: [103.18, 67.06, 20.17, -48.15, -96.14, 0.0]\n  Target bbox: [615.65, 332.5, 664.27, 412.64]\n\nFrame 3:\n  Drone pose: [103.06, 66.36, 20.06, -49.03, -97.52, 0.0]\n  Target bbox: [616.91, 319.34, 663.11, 399.59]\n\nFrame 4:\n  Drone pose: [102.91, 65.86, 20.02, -44.62, -95.18, 0.0]\n  Target bbox: [579.44, 395.03, 621.43, 469.01]\n\nFrame 5 (current):\n  Drone pose: [102.83, 65.24, 20.02, -50.13, -95.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 556.16, \"ymin\": 298.12, \"xmax\": 612.31, \"ymax\": 385.76}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.63, \"dz\": -0.01, \"dpitch\": 1.1, \"dyaw\": -6.55, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.24, \"dz\": -0.01, \"dpitch\": 1.07, \"dyaw\": -8.15, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -1.86, \"dz\": -0.01, \"dpitch\": 0.87, \"dyaw\": -8.07, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -2.47, \"dz\": -0.01, \"dpitch\": 0.66, \"dyaw\": -7.95, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -3.08, \"dz\": -0.01, \"dpitch\": 0.44, \"dyaw\": -7.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.74, "window_alt_abs_m": 0.29, "target_px_mean_hist": 617.2, "cur_frame_id": 34, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.03, 57.93, 20.13, -50.09, -110.62, 0.0]\n  Target bbox: [618.59, 323.15, 661.54, 395.77]\n\nFrame 2:\n  Drone pose: [100.7, 57.14, 19.84, -51.09, -111.58, 0.0]\n  Target bbox: [615.11, 300.22, 665.75, 382.64]\n\nFrame 3:\n  Drone pose: [100.38, 56.67, 19.83, -49.61, -112.12, 0.0]\n  Target bbox: [616.87, 316.11, 663.54, 402.77]\n\nFrame 4:\n  Drone pose: [100.13, 55.93, 20.01, -50.09, -113.29, 0.0]\n  Target bbox: [617.91, 321.59, 662.31, 397.31]\n\nFrame 5 (current):\n  Drone pose: [99.75, 55.38, 19.98, -50.28, -108.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.89, \"ymin\": 316.11, \"xmax\": 613.8, \"ymax\": 398.49}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.45, \"dz\": 0.02, \"dpitch\": 0.36, \"dyaw\": -5.1, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -0.98, \"dz\": 0.02, \"dpitch\": 0.42, \"dyaw\": -5.64, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -1.5, \"dz\": 0.02, \"dpitch\": 0.49, \"dyaw\": -6.12, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": -2.01, \"dz\": 0.02, \"dpitch\": 0.56, \"dyaw\": -6.5, \"droll\": 0.0}, {\"dx\": -1.9, \"dy\": -2.5, \"dz\": 0.02, \"dpitch\": 0.27, \"dyaw\": -5.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.01, "window_alt_abs_m": 0.51, "target_px_mean_hist": 630.2, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.03, 49.84, 20.04, -50.66, -109.77, 0.0]\n  Target bbox: [560.26, 311.34, 610.69, 389.1]\n\nFrame 2:\n  Drone pose: [94.58, 49.45, 20.0, -48.38, -109.85, 0.0]\n  Target bbox: [563.56, 347.59, 608.61, 419.94]\n\nFrame 3:\n  Drone pose: [94.21, 49.03, 20.0, -54.5, -114.97, 0.0]\n  Target bbox: [616.71, 236.35, 660.9, 314.36]\n\nFrame 4:\n  Drone pose: [94.07, 48.75, 20.07, -44.35, -116.25, 0.0]\n  Target bbox: [636.74, 403.32, 687.63, 484.49]\n\nFrame 5 (current):\n  Drone pose: [93.7, 48.2, 19.93, -46.02, -111.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 600.19, \"ymin\": 377.44, \"xmax\": 646.08, \"ymax\": 458.47}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.3, \"dz\": 0.07, \"dpitch\": -3.6, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -0.33, \"dy\": -0.67, \"dz\": 0.07, \"dpitch\": -3.48, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.05, \"dz\": 0.07, \"dpitch\": -3.36, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -0.51, \"dy\": -1.44, \"dz\": 0.07, \"dpitch\": -3.23, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -0.58, \"dy\": -1.85, \"dz\": 0.07, \"dpitch\": -3.12, \"dyaw\": 1.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.08, "window_alt_abs_m": 0.24, "target_px_mean_hist": 647.5, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00081/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.26, 43.08, 20.1, -49.41, -107.31, 0.0]\n  Target bbox: [617.4, 322.32, 662.32, 396.66]\n\nFrame 2:\n  Drone pose: [92.21, 42.52, 20.0, -50.43, -102.18, 0.0]\n  Target bbox: [560.4, 306.81, 608.77, 380.29]\n\nFrame 3:\n  Drone pose: [92.11, 42.2, 20.05, -48.66, -107.48, 0.0]\n  Target bbox: [624.52, 329.9, 673.29, 406.75]\n\nFrame 4:\n  Drone pose: [91.92, 41.59, 20.0, -49.93, -103.42, 0.0]\n  Target bbox: [584.29, 313.49, 634.29, 388.62]\n\nFrame 5 (current):\n  Drone pose: [91.73, 41.22, 20.0, -44.28, -106.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.11, \"ymin\": 405.6, \"xmax\": 674.39, \"ymax\": 481.75}, \"waypoint_deltas\": [{\"dx\": -0.11, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -4.83, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -0.76, \"dz\": 0.0, \"dpitch\": -4.67, \"dyaw\": 1.93, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -1.17, \"dz\": 0.0, \"dpitch\": -4.54, \"dyaw\": 2.03, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.62, \"dz\": 0.0, \"dpitch\": -4.4, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -2.73, \"dz\": 0.0, \"dpitch\": -4.89, \"dyaw\": -2.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.54, "window_alt_abs_m": 0.2, "target_px_mean_hist": 641.5, "cur_frame_id": 81, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.22, 25.8, 20.0, -43.73, -162.73, 0.0]\n  Target bbox: [621.24, 320.4, 659.18, 399.08]\n\nFrame 2:\n  Drone pose: [106.58, 24.81, 20.13, -43.13, -161.12, 0.0]\n  Target bbox: [579.54, 328.63, 620.93, 409.15]\n\nFrame 3:\n  Drone pose: [106.97, 23.68, 20.0, -43.34, -166.4, 0.0]\n  Target bbox: [622.8, 322.01, 657.59, 397.42]\n\nFrame 4:\n  Drone pose: [106.95, 22.96, 19.99, -46.86, -170.2, 0.0]\n  Target bbox: [658.96, 267.6, 699.91, 337.82]\n\nFrame 5 (current):\n  Drone pose: [106.81, 22.06, 20.0, -47.45, -167.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 607.6, \"ymin\": 250.37, \"xmax\": 647.88, \"ymax\": 321.41}, \"waypoint_deltas\": [{\"dx\": -0.22, \"dy\": -0.73, \"dz\": 0.0, \"dpitch\": 4.71, \"dyaw\": -1.75, \"droll\": 0.0}, {\"dx\": -0.73, \"dy\": -1.31, \"dz\": 0.0, \"dpitch\": 4.68, \"dyaw\": -1.97, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": -1.87, \"dz\": 0.0, \"dpitch\": 4.64, \"dyaw\": -2.11, \"droll\": 0.0}, {\"dx\": -1.77, \"dy\": -2.41, \"dz\": 0.0, \"dpitch\": 4.6, \"dyaw\": -2.21, \"droll\": 0.0}, {\"dx\": -2.28, \"dy\": -2.93, \"dz\": 0.0, \"dpitch\": 4.58, \"dyaw\": -2.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.52, "window_alt_abs_m": 0.29, "target_px_mean_hist": 514.5, "cur_frame_id": 96, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.59, 16.12, 19.85, -39.93, -166.24, 0.0]\n  Target bbox: [577.79, 368.53, 615.71, 441.41]\n\nFrame 2:\n  Drone pose: [100.84, 15.73, 20.08, -42.89, -164.26, 0.0]\n  Target bbox: [554.1, 329.1, 602.45, 405.48]\n\nFrame 3:\n  Drone pose: [100.49, 15.37, 20.0, -42.87, -168.96, 0.0]\n  Target bbox: [623.38, 321.85, 657.04, 397.65]\n\nFrame 4:\n  Drone pose: [99.89, 15.04, 20.01, -41.89, -169.99, 0.0]\n  Target bbox: [618.99, 341.36, 665.66, 418.73]\n\nFrame 5 (current):\n  Drone pose: [99.58, 14.74, 19.9, -43.34, -174.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 684.72, \"ymin\": 313.04, \"xmax\": 719.93, \"ymax\": 386.07}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.22, \"dz\": 0.1, \"dpitch\": 0.4, \"dyaw\": 4.41, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -0.49, \"dz\": 0.1, \"dpitch\": 0.45, \"dyaw\": 5.07, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.74, \"dz\": 0.1, \"dpitch\": 0.38, \"dyaw\": 4.37, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": -0.98, \"dz\": 0.1, \"dpitch\": 0.43, \"dyaw\": 5.1, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": -1.2, \"dz\": 0.1, \"dpitch\": 0.39, \"dyaw\": 4.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.06, "window_alt_abs_m": 0.42, "target_px_mean_hist": 553.0, "cur_frame_id": 111, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.85, 11.97, 20.08, -40.0, -171.56, 0.0]\n  Target bbox: [636.51, 363.88, 677.68, 435.06]\n\nFrame 2:\n  Drone pose: [93.3, 11.71, 20.18, -46.39, -168.27, 0.0]\n  Target bbox: [605.78, 255.77, 642.34, 334.14]\n\nFrame 3:\n  Drone pose: [92.88, 11.39, 20.0, -42.22, -170.45, 0.0]\n  Target bbox: [616.03, 319.73, 663.76, 399.92]\n\nFrame 4:\n  Drone pose: [92.42, 11.15, 20.0, -43.98, -172.59, 0.0]\n  Target bbox: [657.77, 290.5, 693.73, 367.18]\n\nFrame 5 (current):\n  Drone pose: [91.95, 10.93, 20.0, -42.12, -170.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.17, \"ymin\": 321.81, \"xmax\": 662.79, \"ymax\": 397.76}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.47, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -1.06, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.82, "window_alt_abs_m": 0.29, "target_px_mean_hist": 553.8, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820/aug_001/frames_playback/frame_00142/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [86.33, 8.56, 20.04, -40.62, -168.86, 0.0]\n  Target bbox: [607.8, 354.16, 644.35, 421.98]\n\nFrame 2:\n  Drone pose: [85.93, 8.23, 20.03, -47.12, -171.87, 0.0]\n  Target bbox: [651.39, 237.59, 687.92, 314.44]\n\nFrame 3:\n  Drone pose: [85.3, 8.27, 20.13, -41.07, -167.44, 0.0]\n  Target bbox: [599.52, 350.11, 632.2, 415.02]\n\nFrame 4:\n  Drone pose: [84.83, 8.02, 20.18, -43.59, -173.15, 0.0]\n  Target bbox: [657.94, 304.88, 698.71, 380.3]\n\nFrame 5 (current):\n  Drone pose: [84.38, 7.78, 20.0, -42.13, -169.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.1, \"ymin\": 322.43, \"xmax\": 657.36, \"ymax\": 397.2}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.42, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": -0.86, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.92, "window_alt_abs_m": 0.32, "target_px_mean_hist": 560.8, "cur_frame_id": 142, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_820", "difficulty_score": 0.3082, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.11, 49.94, 22.0, -46.1, -75.96, 0.0]\n  Target bbox: [629.37, 341.37, 650.69, 378.36]\n\nFrame 2:\n  Drone pose: [-96.13, 48.08, 21.2, -46.48, -72.11, 0.0]\n  Target bbox: [628.39, 340.11, 651.66, 379.59]\n\nFrame 3:\n  Drone pose: [-96.28, 47.3, 20.67, -46.06, -71.41, 0.0]\n  Target bbox: [631.61, 340.98, 648.45, 378.75]\n\nFrame 4:\n  Drone pose: [-96.4, 46.54, 20.64, -46.33, -70.82, 0.0]\n  Target bbox: [629.23, 340.07, 650.84, 379.67]\n\nFrame 5 (current):\n  Drone pose: [-96.41, 45.9, 20.62, -46.48, -70.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.84, \"ymin\": 339.42, \"xmax\": 652.22, \"ymax\": 380.31}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.58, \"dz\": -0.03, \"dpitch\": -0.1, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.14, \"dz\": -0.05, \"dpitch\": -0.18, \"dyaw\": -0.17, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.64, \"dz\": -0.07, \"dpitch\": -0.22, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -2.14, \"dz\": -0.09, \"dpitch\": -0.25, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -2.65, \"dz\": -0.2, \"dpitch\": -0.16, \"dyaw\": -1.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.3, "window_alt_abs_m": 1.38, "target_px_mean_hist": 187.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.42, 40.73, 20.27, -46.7, -73.35, 0.0]\n  Target bbox: [628.37, 339.59, 651.69, 380.09]\n\nFrame 2:\n  Drone pose: [-95.33, 40.22, 20.24, -46.71, -73.62, 0.0]\n  Target bbox: [631.85, 340.71, 648.21, 379.0]\n\nFrame 3:\n  Drone pose: [-95.24, 39.72, 20.22, -46.5, -73.88, 0.0]\n  Target bbox: [628.26, 339.31, 651.82, 380.4]\n\nFrame 4:\n  Drone pose: [-95.15, 39.21, 20.19, -46.51, -74.14, 0.0]\n  Target bbox: [627.47, 339.31, 652.61, 380.4]\n\nFrame 5 (current):\n  Drone pose: [-95.06, 38.7, 20.17, -46.53, -74.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 632.11, \"ymin\": 340.14, \"xmax\": 647.96, \"ymax\": 379.56}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": -0.49, \"dz\": -0.02, \"dpitch\": -0.01, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -0.98, \"dz\": -0.04, \"dpitch\": 0.0, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -1.48, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": -0.76, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -2.0, \"dz\": -0.07, \"dpitch\": -0.04, \"dyaw\": -0.92, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -2.57, \"dz\": -0.08, \"dpitch\": -0.36, \"dyaw\": -1.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.06, "window_alt_abs_m": 0.1, "target_px_mean_hist": 208.2, "cur_frame_id": 18, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.28, 32.61, 20.04, -48.11, -79.52, 0.0]\n  Target bbox: [631.56, 340.34, 648.52, 379.33]\n\nFrame 2:\n  Drone pose: [-92.94, 32.08, 20.03, -48.25, -80.62, 0.0]\n  Target bbox: [631.18, 340.34, 648.9, 379.33]\n\nFrame 3:\n  Drone pose: [-92.61, 31.55, 20.03, -48.37, -81.69, 0.0]\n  Target bbox: [628.19, 340.04, 651.91, 379.61]\n\nFrame 4:\n  Drone pose: [-92.29, 31.03, 20.02, -48.47, -82.73, 0.0]\n  Target bbox: [630.76, 340.69, 649.33, 378.97]\n\nFrame 5 (current):\n  Drone pose: [-91.99, 30.53, 20.02, -48.53, -83.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.76, \"ymin\": 340.09, \"xmax\": 652.35, \"ymax\": 379.56}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.98, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -1.76, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": -1.46, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -1.96, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -2.96, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": -2.47, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": -3.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.2, "window_alt_abs_m": 0.02, "target_px_mean_hist": 215.2, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-91.7, 25.4, 20.0, -48.88, -91.36, 0.0]\n  Target bbox: [626.67, 336.43, 653.33, 383.24]\n\nFrame 2:\n  Drone pose: [-91.99, 24.87, 20.0, -48.92, -92.09, 0.0]\n  Target bbox: [627.21, 337.71, 652.79, 382.0]\n\nFrame 3:\n  Drone pose: [-92.28, 24.34, 20.0, -48.94, -92.79, 0.0]\n  Target bbox: [629.2, 339.21, 650.79, 380.46]\n\nFrame 4:\n  Drone pose: [-92.56, 23.82, 20.0, -48.74, -93.54, 0.0]\n  Target bbox: [628.99, 338.96, 651.01, 380.71]\n\nFrame 5 (current):\n  Drone pose: [-92.82, 23.31, 20.0, -48.73, -94.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.16, \"ymin\": 339.09, \"xmax\": 650.84, \"ymax\": 380.58}, \"waypoint_deltas\": [{\"dx\": -0.21, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.96, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": -1.97, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -0.76, \"dy\": -2.45, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 214.8, "cur_frame_id": 47, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.15, 17.97, 20.0, -48.53, -91.53, 0.0]\n  Target bbox: [628.34, 340.58, 651.54, 379.03]\n\nFrame 2:\n  Drone pose: [-94.26, 17.49, 20.0, -48.5, -91.19, 0.0]\n  Target bbox: [632.97, 340.85, 646.95, 378.78]\n\nFrame 3:\n  Drone pose: [-94.35, 17.01, 20.0, -48.47, -90.85, 0.0]\n  Target bbox: [631.09, 341.01, 648.82, 378.62]\n\nFrame 4:\n  Drone pose: [-94.45, 16.53, 20.0, -48.44, -90.52, 0.0]\n  Target bbox: [633.75, 340.64, 646.18, 379.0]\n\nFrame 5 (current):\n  Drone pose: [-94.55, 16.04, 20.0, -48.42, -90.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.47, \"ymin\": 340.86, \"xmax\": 648.48, \"ymax\": 378.76}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -0.96, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": -0.27, \"dy\": -1.45, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.9, \"droll\": 0.0}, {\"dx\": -0.35, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 1.16, \"droll\": 0.0}, {\"dx\": -0.42, \"dy\": -2.42, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": 1.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.32, "window_alt_abs_m": 0.0, "target_px_mean_hist": 219.8, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.38, 11.03, 20.0, -47.59, -87.49, 0.0]\n  Target bbox: [630.27, 340.73, 649.83, 378.9]\n\nFrame 2:\n  Drone pose: [-95.46, 10.63, 20.0, -47.42, -87.25, 0.0]\n  Target bbox: [629.28, 340.37, 650.83, 379.28]\n\nFrame 3:\n  Drone pose: [-95.55, 10.22, 20.0, -47.27, -86.97, 0.0]\n  Target bbox: [632.78, 341.12, 647.3, 378.55]\n\nFrame 4:\n  Drone pose: [-95.65, 9.8, 20.0, -47.14, -86.65, 0.0]\n  Target bbox: [629.05, 340.35, 651.05, 379.26]\n\nFrame 5 (current):\n  Drone pose: [-95.77, 9.36, 20.0, -47.03, -86.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.25, \"ymin\": 340.12, \"xmax\": 651.87, \"ymax\": 379.53}, \"waypoint_deltas\": [{\"dx\": -0.15, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -0.84, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": -1.19, \"dz\": 0.0, \"dpitch\": 0.55, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": -1.52, \"dz\": 0.0, \"dpitch\": 0.86, \"dyaw\": 2.43, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": -1.88, \"dz\": 0.0, \"dpitch\": 1.14, \"dyaw\": 3.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 220.2, "cur_frame_id": 77, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.85, 4.27, 20.0, -46.44, -76.62, 0.0]\n  Target bbox: [626.9, 337.09, 653.05, 382.66]\n\nFrame 2:\n  Drone pose: [-99.02, 3.67, 20.0, -46.72, -77.56, 0.0]\n  Target bbox: [628.0, 337.97, 651.94, 381.77]\n\nFrame 3:\n  Drone pose: [-99.21, 3.11, 20.0, -46.9, -78.47, 0.0]\n  Target bbox: [629.21, 338.86, 650.74, 380.87]\n\nFrame 4:\n  Drone pose: [-99.48, 2.58, 20.0, -47.03, -79.18, 0.0]\n  Target bbox: [629.17, 338.83, 650.78, 380.89]\n\nFrame 5 (current):\n  Drone pose: [-99.82, 2.05, 20.0, -47.12, -79.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.51, \"ymin\": 338.54, \"xmax\": 651.44, \"ymax\": 381.19}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.33, \"droll\": 0.0}, {\"dx\": -0.8, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -1.17, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": -0.43, \"dyaw\": -1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.05, "window_alt_abs_m": 0.0, "target_px_mean_hist": 205.0, "cur_frame_id": 91, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00106/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.89, -3.79, 20.0, -48.21, -84.11, 0.0]\n  Target bbox: [629.6, 339.16, 650.37, 380.51]\n\nFrame 2:\n  Drone pose: [-104.17, -4.37, 20.0, -48.39, -84.79, 0.0]\n  Target bbox: [627.66, 337.71, 652.31, 381.99]\n\nFrame 3:\n  Drone pose: [-104.44, -4.96, 20.0, -48.56, -85.55, 0.0]\n  Target bbox: [627.02, 336.62, 652.96, 383.04]\n\nFrame 4:\n  Drone pose: [-104.69, -5.54, 20.0, -48.72, -86.36, 0.0]\n  Target bbox: [628.46, 338.2, 651.52, 381.47]\n\nFrame 5 (current):\n  Drone pose: [-104.94, -6.11, 20.0, -48.87, -87.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.79, \"ymin\": 337.55, \"xmax\": 652.19, \"ymax\": 382.12}, \"waypoint_deltas\": [{\"dx\": -0.25, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": -1.66, \"droll\": 0.0}, {\"dx\": -0.76, \"dy\": -1.66, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": -2.48, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": -2.19, \"dz\": 0.0, \"dpitch\": -0.35, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": -1.26, \"dy\": -2.72, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": -4.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 219.8, "cur_frame_id": 106, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.96, -11.3, 20.0, -49.19, -92.22, 0.0]\n  Target bbox: [628.51, 340.28, 651.38, 379.28]\n\nFrame 2:\n  Drone pose: [-107.04, -11.77, 20.0, -49.16, -91.94, 0.0]\n  Target bbox: [629.63, 340.7, 650.27, 378.89]\n\nFrame 3:\n  Drone pose: [-107.11, -12.25, 20.0, -49.12, -91.68, 0.0]\n  Target bbox: [628.35, 340.38, 651.54, 379.19]\n\nFrame 4:\n  Drone pose: [-107.19, -12.73, 20.0, -49.08, -91.43, 0.0]\n  Target bbox: [632.8, 340.59, 647.12, 379.04]\n\nFrame 5 (current):\n  Drone pose: [-107.26, -13.2, 20.0, -49.05, -91.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.5, \"ymin\": 340.28, \"xmax\": 651.38, \"ymax\": 379.32}, \"waypoint_deltas\": [{\"dx\": -0.08, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.83, \"droll\": 0.0}, {\"dx\": -0.33, \"dy\": -1.91, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -2.39, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": 1.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.05, "window_alt_abs_m": 0.0, "target_px_mean_hist": 220.2, "cur_frame_id": 120, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00131/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/ORI/frames_playback/frame_00135/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.13, -18.48, 20.0, -48.66, -88.25, 0.0]\n  Target bbox: [630.9, 340.6, 649.2, 379.03]\n\nFrame 2:\n  Drone pose: [-108.19, -18.96, 20.0, -48.63, -88.03, 0.0]\n  Target bbox: [629.04, 340.77, 651.07, 378.87]\n\nFrame 3:\n  Drone pose: [-108.26, -19.45, 20.0, -48.6, -87.82, 0.0]\n  Target bbox: [631.16, 340.83, 648.93, 378.79]\n\nFrame 4:\n  Drone pose: [-108.32, -19.93, 20.0, -48.58, -87.62, 0.0]\n  Target bbox: [629.45, 340.73, 650.65, 378.87]\n\nFrame 5 (current):\n  Drone pose: [-108.38, -20.42, 20.0, -48.55, -87.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.44, \"ymin\": 340.34, \"xmax\": 651.67, \"ymax\": 379.25}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -2.43, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 218.8, "cur_frame_id": 135, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.11, 49.94, 22.0, -49.63, -75.41, 0.0]\n  Target bbox: [622.32, 282.09, 644.73, 319.32]\n\nFrame 2:\n  Drone pose: [-96.13, 48.08, 21.2, -46.48, -72.11, 0.0]\n  Target bbox: [629.92, 341.2, 650.15, 378.56]\n\nFrame 3:\n  Drone pose: [-96.23, 47.39, 20.55, -47.92, -75.89, 0.0]\n  Target bbox: [677.58, 305.48, 702.27, 345.15]\n\nFrame 4:\n  Drone pose: [-96.44, 46.73, 20.66, -44.21, -73.83, 0.0]\n  Target bbox: [665.63, 373.06, 682.74, 410.31]\n\nFrame 5 (current):\n  Drone pose: [-96.41, 45.9, 20.62, -43.43, -71.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 642.47, \"ymin\": 391.89, \"xmax\": 658.42, \"ymax\": 430.25}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.58, \"dz\": -0.03, \"dpitch\": -3.15, \"dyaw\": 0.85, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.14, \"dz\": -0.05, \"dpitch\": -3.23, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.64, \"dz\": -0.07, \"dpitch\": -3.27, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -2.14, \"dz\": -0.09, \"dpitch\": -3.3, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -2.65, \"dz\": -0.2, \"dpitch\": -3.21, \"dyaw\": -0.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.43, "window_alt_abs_m": 1.6, "target_px_mean_hist": 180.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.42, 40.73, 20.27, -45.55, -74.09, 0.0]\n  Target bbox: [636.2, 358.52, 661.14, 399.92]\n\nFrame 2:\n  Drone pose: [-95.33, 40.22, 20.24, -50.91, -76.24, 0.0]\n  Target bbox: [657.4, 269.15, 683.34, 310.43]\n\nFrame 3:\n  Drone pose: [-95.24, 39.72, 20.22, -46.33, -68.88, 0.0]\n  Target bbox: [570.22, 344.0, 594.11, 385.3]\n\nFrame 4:\n  Drone pose: [-95.26, 39.04, 20.18, -43.45, -76.52, 0.0]\n  Target bbox: [663.71, 395.34, 681.64, 434.84]\n\nFrame 5 (current):\n  Drone pose: [-95.06, 38.7, 20.17, -50.48, -75.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 642.24, \"ymin\": 273.02, \"xmax\": 666.13, \"ymax\": 314.1}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": -0.49, \"dz\": -0.02, \"dpitch\": 3.94, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -0.98, \"dz\": -0.04, \"dpitch\": 3.95, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -1.48, \"dz\": -0.05, \"dpitch\": 3.95, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -2.0, \"dz\": -0.07, \"dpitch\": 3.91, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -2.57, \"dz\": -0.08, \"dpitch\": 3.59, \"dyaw\": 0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.04, "window_alt_abs_m": 0.1, "target_px_mean_hist": 211.8, "cur_frame_id": 18, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.28, 32.61, 20.04, -44.17, -79.91, 0.0]\n  Target bbox: [635.25, 406.7, 653.66, 445.45]\n\nFrame 2:\n  Drone pose: [-92.96, 32.19, 20.01, -44.48, -75.61, 0.0]\n  Target bbox: [570.8, 400.25, 596.57, 442.09]\n\nFrame 3:\n  Drone pose: [-92.6, 31.73, 20.06, -51.51, -84.65, 0.0]\n  Target bbox: [663.38, 284.45, 681.35, 323.39]\n\nFrame 4:\n  Drone pose: [-92.29, 31.03, 20.02, -46.44, -77.73, 0.0]\n  Target bbox: [571.75, 375.31, 596.76, 415.95]\n\nFrame 5 (current):\n  Drone pose: [-91.99, 30.53, 20.02, -53.18, -88.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 682.59, \"ymin\": 263.77, \"xmax\": 707.75, \"ymax\": 303.25}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 4.62, \"dyaw\": 4.0, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.98, \"dz\": -0.01, \"dpitch\": 4.62, \"dyaw\": 3.17, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": -1.46, \"dz\": -0.01, \"dpitch\": 4.62, \"dyaw\": 2.48, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -1.96, \"dz\": -0.01, \"dpitch\": 4.62, \"dyaw\": 1.97, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": -2.47, \"dz\": -0.01, \"dpitch\": 4.6, \"dyaw\": 1.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 31.18, "window_alt_abs_m": 0.12, "target_px_mean_hist": 219.0, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00047/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-91.7, 25.4, 20.0, -49.78, -90.19, 0.0]\n  Target bbox: [613.57, 321.95, 640.14, 367.74]\n\nFrame 2:\n  Drone pose: [-91.99, 24.87, 20.0, -51.16, -89.34, 0.0]\n  Target bbox: [597.29, 301.14, 621.7, 344.31]\n\nFrame 3:\n  Drone pose: [-92.28, 24.34, 20.0, -47.89, -91.34, 0.0]\n  Target bbox: [611.96, 356.24, 635.56, 398.94]\n\nFrame 4:\n  Drone pose: [-92.56, 23.82, 20.0, -48.74, -93.54, 0.0]\n  Target bbox: [628.73, 338.7, 651.26, 380.97]\n\nFrame 5 (current):\n  Drone pose: [-92.83, 23.37, 20.02, -46.52, -89.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 570.29, \"ymin\": 374.75, \"xmax\": 598.49, \"ymax\": 420.7}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.57, \"dz\": -0.02, \"dpitch\": -2.18, \"dyaw\": -6.03, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": -1.06, \"dz\": -0.02, \"dpitch\": -2.2, \"dyaw\": -5.42, \"droll\": 0.0}, {\"dx\": -0.53, \"dy\": -1.55, \"dz\": -0.02, \"dpitch\": -2.19, \"dyaw\": -4.92, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -2.03, \"dz\": -0.02, \"dpitch\": -2.18, \"dyaw\": -4.51, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -2.51, \"dz\": -0.02, \"dpitch\": -2.16, \"dyaw\": -4.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.28, "window_alt_abs_m": 0.02, "target_px_mean_hist": 219.5, "cur_frame_id": 47, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-94.23, 17.91, 19.95, -50.22, -91.38, 0.0]\n  Target bbox: [629.18, 312.69, 652.75, 351.2]\n\nFrame 2:\n  Drone pose: [-94.41, 17.51, 19.96, -53.21, -95.49, 0.0]\n  Target bbox: [681.0, 261.06, 706.78, 300.98]\n\nFrame 3:\n  Drone pose: [-94.35, 17.01, 20.0, -51.78, -90.6, 0.0]\n  Target bbox: [625.56, 284.63, 648.61, 324.05]\n\nFrame 4:\n  Drone pose: [-94.39, 16.6, 20.03, -43.37, -89.22, 0.0]\n  Target bbox: [614.97, 424.67, 631.02, 463.35]\n\nFrame 5 (current):\n  Drone pose: [-94.69, 16.01, 20.13, -46.76, -84.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.38, \"ymin\": 373.52, \"xmax\": 596.6, \"ymax\": 413.58}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.45, \"dz\": -0.13, \"dpitch\": -1.63, \"dyaw\": -5.15, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -0.93, \"dz\": -0.13, \"dpitch\": -1.6, \"dyaw\": -4.85, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.42, \"dz\": -0.13, \"dpitch\": -1.57, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -1.9, \"dz\": -0.13, \"dpitch\": -1.55, \"dyaw\": -4.3, \"droll\": 0.0}, {\"dx\": -0.28, \"dy\": -2.39, \"dz\": -0.13, \"dpitch\": -1.52, \"dyaw\": -4.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.86, "window_alt_abs_m": 0.18, "target_px_mean_hist": 225.2, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00077/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.38, 11.03, 20.0, -52.59, -92.49, 0.0]\n  Target bbox: [685.75, 257.98, 707.96, 297.29]\n\nFrame 2:\n  Drone pose: [-95.29, 10.55, 19.98, -47.53, -87.77, 0.0]\n  Target bbox: [632.36, 341.14, 647.72, 378.53]\n\nFrame 3:\n  Drone pose: [-95.42, 10.16, 19.99, -51.14, -87.47, 0.0]\n  Target bbox: [632.14, 277.74, 650.3, 315.34]\n\nFrame 4:\n  Drone pose: [-95.81, 9.87, 19.91, -46.89, -86.16, 0.0]\n  Target bbox: [633.11, 340.89, 646.97, 378.78]\n\nFrame 5 (current):\n  Drone pose: [-95.77, 9.36, 20.0, -47.03, -86.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 633.11, \"ymin\": 340.99, \"xmax\": 646.97, \"ymax\": 378.68}, \"waypoint_deltas\": [{\"dx\": -0.15, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -0.84, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": -1.19, \"dz\": 0.0, \"dpitch\": 0.55, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": -0.82, \"dy\": -1.52, \"dz\": 0.0, \"dpitch\": 0.86, \"dyaw\": 2.43, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": -1.88, \"dz\": 0.0, \"dpitch\": 1.14, \"dyaw\": 3.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.45, "window_alt_abs_m": 0.2, "target_px_mean_hist": 222.5, "cur_frame_id": 77, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00091/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.85, 4.27, 20.0, -46.44, -76.62, 0.0]\n  Target bbox: [627.11, 337.34, 652.84, 382.41]\n\nFrame 2:\n  Drone pose: [-99.17, 3.7, 20.04, -47.79, -72.12, 0.0]\n  Target bbox: [571.07, 321.91, 593.47, 363.77]\n\nFrame 3:\n  Drone pose: [-99.24, 3.06, 19.88, -46.79, -78.36, 0.0]\n  Target bbox: [627.53, 337.8, 652.44, 381.92]\n\nFrame 4:\n  Drone pose: [-99.5, 2.44, 19.98, -48.64, -84.02, 0.0]\n  Target bbox: [685.97, 316.89, 708.25, 358.19]\n\nFrame 5 (current):\n  Drone pose: [-99.81, 2.14, 20.04, -47.05, -79.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.38, \"ymin\": 338.04, \"xmax\": 651.58, \"ymax\": 381.67}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": -0.61, \"dz\": -0.04, \"dpitch\": -0.14, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": -1.12, \"dz\": -0.04, \"dpitch\": -0.18, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": -1.22, \"dy\": -1.63, \"dz\": -0.04, \"dpitch\": -0.2, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": -1.63, \"dy\": -2.13, \"dz\": -0.04, \"dpitch\": -0.24, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -2.64, \"dz\": -0.04, \"dpitch\": -0.5, \"dyaw\": -1.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.67, "window_alt_abs_m": 0.36, "target_px_mean_hist": 206.8, "cur_frame_id": 91, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00106/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.89, -3.79, 20.0, -48.21, -84.11, 0.0]\n  Target bbox: [628.81, 338.48, 651.16, 381.2]\n\nFrame 2:\n  Drone pose: [-104.08, -4.26, 20.09, -46.01, -90.14, 0.0]\n  Target bbox: [683.31, 379.11, 708.56, 422.35]\n\nFrame 3:\n  Drone pose: [-104.4, -4.79, 19.93, -50.32, -90.71, 0.0]\n  Target bbox: [682.34, 302.23, 709.96, 349.01]\n\nFrame 4:\n  Drone pose: [-104.69, -5.63, 20.15, -53.42, -87.26, 0.0]\n  Target bbox: [638.93, 265.62, 661.84, 308.73]\n\nFrame 5 (current):\n  Drone pose: [-104.85, -6.17, 19.85, -48.74, -87.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.92, \"ymin\": 338.78, \"xmax\": 651.08, \"ymax\": 380.89}, \"waypoint_deltas\": [{\"dx\": -0.34, \"dy\": -0.5, \"dz\": 0.15, \"dpitch\": -0.25, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": -1.05, \"dz\": 0.15, \"dpitch\": -0.35, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": -0.85, \"dy\": -1.6, \"dz\": 0.15, \"dpitch\": -0.42, \"dyaw\": -2.18, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -2.13, \"dz\": 0.15, \"dpitch\": -0.48, \"dyaw\": -3.01, \"droll\": 0.0}, {\"dx\": -1.35, \"dy\": -2.66, \"dz\": 0.15, \"dpitch\": -0.51, \"dyaw\": -3.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.29, "window_alt_abs_m": 0.78, "target_px_mean_hist": 206.8, "cur_frame_id": 106, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.95, -11.24, 19.87, -48.91, -92.25, 0.0]\n  Target bbox: [628.57, 340.19, 651.32, 379.38]\n\nFrame 2:\n  Drone pose: [-107.16, -11.7, 20.01, -49.05, -91.53, 0.0]\n  Target bbox: [630.56, 340.59, 649.34, 379.03]\n\nFrame 3:\n  Drone pose: [-107.11, -12.25, 20.0, -54.0, -96.68, 0.0]\n  Target bbox: [686.76, 260.17, 703.51, 299.11]\n\nFrame 4:\n  Drone pose: [-107.16, -12.75, 19.83, -48.86, -91.52, 0.0]\n  Target bbox: [628.69, 340.26, 651.2, 379.31]\n\nFrame 5 (current):\n  Drone pose: [-107.19, -13.09, 19.87, -48.66, -91.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 633.22, \"ymin\": 340.89, \"xmax\": 646.71, \"ymax\": 378.75}, \"waypoint_deltas\": [{\"dx\": -0.15, \"dy\": -0.59, \"dz\": 0.13, \"dpitch\": -0.35, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": -1.06, \"dz\": 0.13, \"dpitch\": -0.31, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -1.54, \"dz\": 0.13, \"dpitch\": -0.28, \"dyaw\": 1.07, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -2.02, \"dz\": 0.13, \"dpitch\": -0.24, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": -2.5, \"dz\": 0.13, \"dpitch\": -0.2, \"dyaw\": 1.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.14, "window_alt_abs_m": 0.36, "target_px_mean_hist": 222.2, "cur_frame_id": 120, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00131/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693/aug_001/frames_playback/frame_00135/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.13, -18.48, 20.0, -43.91, -92.69, 0.0]\n  Target bbox: [678.22, 421.69, 700.57, 460.43]\n\nFrame 2:\n  Drone pose: [-108.1, -19.05, 20.11, -48.93, -93.36, 0.0]\n  Target bbox: [683.1, 342.28, 707.27, 381.4]\n\nFrame 3:\n  Drone pose: [-108.21, -19.58, 20.01, -44.66, -91.35, 0.0]\n  Target bbox: [665.83, 411.45, 689.26, 450.65]\n\nFrame 4:\n  Drone pose: [-108.35, -19.79, 19.87, -48.14, -87.55, 0.0]\n  Target bbox: [628.61, 340.25, 651.5, 379.34]\n\nFrame 5 (current):\n  Drone pose: [-108.38, -20.42, 20.0, -48.55, -87.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.87, \"ymin\": 340.38, \"xmax\": 651.24, \"ymax\": 379.2}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -2.43, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.59, "window_alt_abs_m": 0.48, "target_px_mean_hist": 224.5, "cur_frame_id": 135, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_693", "difficulty_score": 0.2915, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-74.57, -85.33, 22.0, -46.48, 81.47, 0.0]\n  Target bbox: [620.65, 329.66, 659.1, 389.61]\n\nFrame 2:\n  Drone pose: [-77.43, -84.22, 21.2, -45.33, 73.1, 0.0]\n  Target bbox: [624.67, 328.74, 655.17, 390.54]\n\nFrame 3:\n  Drone pose: [-80.18, -82.93, 20.7, -44.26, 64.97, 0.0]\n  Target bbox: [622.83, 327.64, 657.06, 391.73]\n\nFrame 4:\n  Drone pose: [-82.93, -81.83, 20.67, -43.12, 57.52, 0.0]\n  Target bbox: [616.38, 323.69, 663.65, 395.89]\n\nFrame 5 (current):\n  Drone pose: [-85.2, -80.82, 20.64, -41.88, 51.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.99, \"ymin\": 326.42, \"xmax\": 659.05, \"ymax\": 393.16}, \"waypoint_deltas\": [{\"dx\": -1.58, \"dy\": 0.89, \"dz\": -0.02, \"dpitch\": 0.95, \"dyaw\": -3.76, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": 1.69, \"dz\": -0.04, \"dpitch\": 1.51, \"dyaw\": -6.01, \"droll\": 0.0}, {\"dx\": -3.01, \"dy\": 2.39, \"dz\": -0.07, \"dpitch\": 2.21, \"dyaw\": -8.04, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": 3.07, \"dz\": -0.09, \"dpitch\": 2.71, \"dyaw\": -9.52, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": 3.69, \"dz\": -0.19, \"dpitch\": 2.77, \"dyaw\": -9.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 29.67, "window_alt_abs_m": 1.36, "target_px_mean_hist": 515.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00020/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-88.0, -73.08, 20.24, -39.17, 40.95, 0.0]\n  Target bbox: [619.75, 324.53, 660.44, 395.3]\n\nFrame 2:\n  Drone pose: [-87.96, -72.58, 20.21, -39.17, 41.02, 0.0]\n  Target bbox: [625.05, 329.3, 654.98, 390.32]\n\nFrame 3:\n  Drone pose: [-87.93, -72.09, 20.19, -39.16, 41.08, 0.0]\n  Target bbox: [620.67, 325.39, 659.52, 394.42]\n\nFrame 4:\n  Drone pose: [-87.91, -71.61, 20.17, -39.13, 41.14, 0.0]\n  Target bbox: [620.66, 325.38, 659.53, 394.43]\n\nFrame 5 (current):\n  Drone pose: [-87.91, -71.14, 20.15, -39.08, 41.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.86, \"ymin\": 324.32, \"xmax\": 661.38, \"ymax\": 395.59}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.47, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 0.94, \"dz\": -0.03, \"dpitch\": 0.11, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.41, \"dz\": -0.04, \"dpitch\": 0.19, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 1.88, \"dz\": -0.06, \"dpitch\": 0.45, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 2.37, \"dz\": -0.07, \"dpitch\": 0.48, \"dyaw\": 0.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.25, "window_alt_abs_m": 0.09, "target_px_mean_hist": 480.0, "cur_frame_id": 20, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.69, -65.77, 20.02, -38.39, 44.38, 0.0]\n  Target bbox: [625.95, 331.71, 653.82, 387.78]\n\nFrame 2:\n  Drone pose: [-83.57, -65.7, 20.02, -38.54, 46.16, 0.0]\n  Target bbox: [625.15, 328.27, 655.19, 391.28]\n\nFrame 3:\n  Drone pose: [-82.28, -65.68, 20.02, -38.75, 48.36, 0.0]\n  Target bbox: [624.94, 328.82, 655.38, 390.79]\n\nFrame 4:\n  Drone pose: [-80.93, -65.69, 20.01, -38.94, 50.73, 0.0]\n  Target bbox: [628.72, 328.65, 651.51, 390.9]\n\nFrame 5 (current):\n  Drone pose: [-79.58, -65.74, 20.01, -39.04, 53.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.59, \"ymin\": 327.33, \"xmax\": 653.68, \"ymax\": 392.25}, \"waypoint_deltas\": [{\"dx\": 1.33, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 2.44, \"droll\": 0.0}, {\"dx\": 2.73, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 5.04, \"droll\": 0.0}, {\"dx\": 4.31, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": -0.51, \"dyaw\": 9.07, \"droll\": 0.0}, {\"dx\": 6.23, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -1.02, \"dyaw\": 13.91, \"droll\": 0.0}, {\"dx\": 8.48, \"dy\": 0.17, \"dz\": -0.01, \"dpitch\": -1.65, \"dyaw\": 19.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.82, "window_alt_abs_m": 0.01, "target_px_mean_hist": 501.5, "cur_frame_id": 37, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.02, -56.6, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.39, 326.5, 656.65, 392.87]\n\nFrame 2:\n  Drone pose: [-52.02, -56.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [621.82, 325.65, 658.18, 393.76]\n\nFrame 3:\n  Drone pose: [-52.02, -55.6, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [617.36, 322.71, 662.54, 396.82]\n\nFrame 4:\n  Drone pose: [-52.02, -55.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.14, 326.44, 656.89, 392.94]\n\nFrame 5 (current):\n  Drone pose: [-52.02, -54.6, 20.0, -43.39, 128.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.86, \"ymin\": 323.17, \"xmax\": 662.05, \"ymax\": 396.35}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 535.5, "cur_frame_id": 53, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.52, -48.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [622.62, 326.24, 657.04, 393.1]\n\nFrame 2:\n  Drone pose: [-49.02, -47.6, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.43, 326.97, 656.3, 392.22]\n\nFrame 3:\n  Drone pose: [-48.52, -47.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [622.74, 325.6, 656.9, 393.78]\n\nFrame 4:\n  Drone pose: [-48.02, -46.6, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.81, 326.56, 655.88, 392.75]\n\nFrame 5 (current):\n  Drone pose: [-47.52, -46.1, 20.0, -43.39, 128.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.19, \"ymin\": 327.28, \"xmax\": 654.54, \"ymax\": 391.98}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 1.42, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 1.83, \"dz\": 0.0, \"dpitch\": -0.55, \"dyaw\": -2.2, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": 2.25, \"dz\": 0.0, \"dpitch\": -0.81, \"dyaw\": -3.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 563.2, "cur_frame_id": 70, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.62, -40.52, 20.0, -46.16, 115.88, 0.0]\n  Target bbox: [625.29, 325.6, 654.48, 393.62]\n\nFrame 2:\n  Drone pose: [-45.53, -40.11, 20.0, -46.32, 114.59, 0.0]\n  Target bbox: [618.45, 324.09, 661.65, 395.11]\n\nFrame 3:\n  Drone pose: [-45.45, -39.69, 20.0, -46.14, 114.71, 0.0]\n  Target bbox: [616.48, 323.28, 663.63, 395.97]\n\nFrame 4:\n  Drone pose: [-45.37, -39.28, 20.0, -45.97, 114.83, 0.0]\n  Target bbox: [615.61, 321.78, 664.51, 397.45]\n\nFrame 5 (current):\n  Drone pose: [-45.38, -38.88, 20.0, -45.84, 114.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.33, \"ymin\": 322.29, \"xmax\": 665.77, \"ymax\": 397.03}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 0.78, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -0.81, \"droll\": 0.0}, {\"dx\": -0.55, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": -2.02, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": 1.47, \"dz\": 0.0, \"dpitch\": 0.22, \"dyaw\": -3.23, \"droll\": 0.0}, {\"dx\": -1.27, \"dy\": 1.81, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -4.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 611.5, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.19, -34.67, 20.0, -45.33, 98.92, 0.0]\n  Target bbox: [619.75, 323.29, 660.15, 396.06]\n\nFrame 2:\n  Drone pose: [-49.55, -34.32, 20.0, -45.27, 96.26, 0.0]\n  Target bbox: [620.52, 323.7, 659.41, 395.61]\n\nFrame 3:\n  Drone pose: [-49.91, -33.98, 20.0, -45.15, 93.62, 0.0]\n  Target bbox: [618.18, 322.36, 661.79, 397.06]\n\nFrame 4:\n  Drone pose: [-50.27, -33.64, 20.0, -44.96, 91.0, 0.0]\n  Target bbox: [621.31, 324.96, 658.6, 394.41]\n\nFrame 5 (current):\n  Drone pose: [-50.64, -33.29, 20.0, -44.72, 88.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.42, \"ymin\": 324.15, \"xmax\": 659.56, \"ymax\": 395.23}, \"waypoint_deltas\": [{\"dx\": -0.36, \"dy\": 0.34, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 0.74, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": -4.26, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": 0.75, \"dyaw\": -5.97, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": 1.55, \"dz\": 0.0, \"dpitch\": 1.01, \"dyaw\": -7.64, \"droll\": 0.0}, {\"dx\": -0.76, \"dy\": 1.96, \"dz\": 0.0, \"dpitch\": 1.29, \"dyaw\": -9.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.49, "window_alt_abs_m": 0.0, "target_px_mean_hist": 549.5, "cur_frame_id": 103, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.21, -28.09, 20.0, -42.17, 77.38, 0.0]\n  Target bbox: [621.18, 327.87, 658.61, 391.49]\n\nFrame 2:\n  Drone pose: [-52.31, -27.69, 20.0, -42.01, 77.17, 0.0]\n  Target bbox: [623.67, 327.91, 656.12, 391.46]\n\nFrame 3:\n  Drone pose: [-52.41, -27.28, 20.0, -41.86, 76.96, 0.0]\n  Target bbox: [617.79, 325.81, 661.95, 393.58]\n\nFrame 4:\n  Drone pose: [-52.51, -26.88, 20.0, -41.7, 76.75, 0.0]\n  Target bbox: [620.9, 327.75, 658.89, 391.66]\n\nFrame 5 (current):\n  Drone pose: [-52.61, -26.47, 20.0, -41.55, 76.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.52, \"ymin\": 327.35, \"xmax\": 661.24, \"ymax\": 392.12}, \"waypoint_deltas\": [{\"dx\": -0.1, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": 0.61, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": 0.76, \"dyaw\": -0.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 551.8, "cur_frame_id": 120, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00135/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00136/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.82, -21.61, 20.0, -39.75, 74.27, 0.0]\n  Target bbox: [625.43, 329.66, 654.4, 389.87]\n\nFrame 2:\n  Drone pose: [-53.92, -21.21, 20.0, -39.61, 74.09, 0.0]\n  Target bbox: [620.58, 328.13, 659.24, 391.41]\n\nFrame 3:\n  Drone pose: [-53.93, -20.77, 20.0, -39.53, 74.11, 0.0]\n  Target bbox: [619.9, 327.15, 659.89, 392.4]\n\nFrame 4:\n  Drone pose: [-53.92, -20.33, 20.0, -39.46, 74.17, 0.0]\n  Target bbox: [620.96, 328.49, 658.86, 391.08]\n\nFrame 5 (current):\n  Drone pose: [-53.83, -19.86, 20.0, -39.46, 74.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.13, \"ymin\": 328.14, \"xmax\": 657.68, \"ymax\": 391.41}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.23, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 0.95, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 1.43, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.91, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 2.39, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.5, "window_alt_abs_m": 0.0, "target_px_mean_hist": 499.8, "cur_frame_id": 136, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00149/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00150/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00151/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00152/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/ORI/frames_playback/frame_00153/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.89, -13.61, 20.0, -39.45, 76.82, 0.0]\n  Target bbox: [618.6, 328.08, 661.17, 391.49]\n\nFrame 2:\n  Drone pose: [-52.84, -13.12, 20.0, -39.44, 76.95, 0.0]\n  Target bbox: [620.73, 328.52, 659.07, 390.99]\n\nFrame 3:\n  Drone pose: [-52.79, -12.64, 20.0, -39.44, 77.08, 0.0]\n  Target bbox: [621.39, 328.85, 658.41, 390.66]\n\nFrame 4:\n  Drone pose: [-52.74, -12.15, 20.0, -39.44, 77.2, 0.0]\n  Target bbox: [626.89, 329.95, 652.94, 389.54]\n\nFrame 5 (current):\n  Drone pose: [-52.7, -11.66, 20.0, -39.44, 77.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.89, \"ymin\": 330.29, \"xmax\": 652.95, \"ymax\": 389.22}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 0.98, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 1.47, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.96, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": 2.45, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.38, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.49, "window_alt_abs_m": 0.0, "target_px_mean_hist": 502.0, "cur_frame_id": 153, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-74.57, -85.33, 22.0, -46.48, 81.47, 0.0]\n  Target bbox: [620.07, 325.18, 660.01, 394.22]\n\nFrame 2:\n  Drone pose: [-77.43, -84.08, 21.06, -43.99, 73.32, 0.0]\n  Target bbox: [586.19, 327.14, 627.28, 392.67]\n\nFrame 3:\n  Drone pose: [-80.18, -82.93, 20.7, -41.5, 59.97, 0.0]\n  Target bbox: [679.79, 373.89, 721.24, 442.06]\n\nFrame 4:\n  Drone pose: [-82.93, -81.83, 20.67, -47.83, 60.11, 0.0]\n  Target bbox: [587.53, 247.26, 627.9, 314.67]\n\nFrame 5 (current):\n  Drone pose: [-85.2, -80.82, 20.64, -41.14, 56.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.13, \"ymin\": 340.31, \"xmax\": 603.92, \"ymax\": 407.24}, \"waypoint_deltas\": [{\"dx\": -1.58, \"dy\": 0.89, \"dz\": -0.02, \"dpitch\": 0.21, \"dyaw\": -8.25, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": 1.69, \"dz\": -0.04, \"dpitch\": 0.77, \"dyaw\": -10.5, \"droll\": 0.0}, {\"dx\": -3.01, \"dy\": 2.39, \"dz\": -0.07, \"dpitch\": 1.47, \"dyaw\": -12.53, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": 3.07, \"dz\": -0.09, \"dpitch\": 1.97, \"dyaw\": -14.01, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": 3.69, \"dz\": -0.19, \"dpitch\": 2.03, \"dyaw\": -14.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.45, "window_alt_abs_m": 1.36, "target_px_mean_hist": 483.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00020/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-88.0, -73.08, 20.24, -37.39, 45.81, 0.0]\n  Target bbox: [557.23, 358.0, 595.57, 424.75]\n\nFrame 2:\n  Drone pose: [-87.96, -72.58, 20.21, -39.17, 41.02, 0.0]\n  Target bbox: [624.41, 328.58, 655.63, 391.05]\n\nFrame 3:\n  Drone pose: [-87.93, -72.09, 20.19, -39.16, 41.08, 0.0]\n  Target bbox: [625.32, 331.55, 654.53, 387.92]\n\nFrame 4:\n  Drone pose: [-88.02, -71.46, 20.25, -38.66, 32.81, 0.0]\n  Target bbox: [662.9, 338.07, 690.41, 399.04]\n\nFrame 5 (current):\n  Drone pose: [-87.91, -71.14, 20.15, -39.08, 41.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.34, \"ymin\": 324.17, \"xmax\": 660.86, \"ymax\": 395.68}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.47, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 0.94, \"dz\": -0.03, \"dpitch\": 0.11, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.41, \"dz\": -0.04, \"dpitch\": 0.19, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 1.88, \"dz\": -0.06, \"dpitch\": 0.45, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 2.37, \"dz\": -0.07, \"dpitch\": 0.48, \"dyaw\": 0.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.52, "window_alt_abs_m": 0.19, "target_px_mean_hist": 438.0, "cur_frame_id": 20, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.69, -65.77, 20.02, -38.39, 44.38, 0.0]\n  Target bbox: [625.61, 330.7, 654.15, 388.77]\n\nFrame 2:\n  Drone pose: [-83.57, -65.7, 20.02, -41.04, 49.48, 0.0]\n  Target bbox: [582.23, 287.92, 610.55, 349.25]\n\nFrame 3:\n  Drone pose: [-82.28, -65.68, 20.02, -38.75, 48.36, 0.0]\n  Target bbox: [627.76, 329.94, 652.46, 389.58]\n\nFrame 4:\n  Drone pose: [-80.93, -65.69, 20.01, -38.06, 46.17, 0.0]\n  Target bbox: [683.13, 344.0, 716.66, 407.92]\n\nFrame 5 (current):\n  Drone pose: [-79.58, -65.74, 20.01, -39.3, 51.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 646.28, \"ymin\": 322.47, \"xmax\": 680.25, \"ymax\": 389.1}, \"waypoint_deltas\": [{\"dx\": 1.33, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.22, \"dyaw\": 4.19, \"droll\": 0.0}, {\"dx\": 2.73, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": 6.79, \"droll\": 0.0}, {\"dx\": 4.31, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": -0.25, \"dyaw\": 10.82, \"droll\": 0.0}, {\"dx\": 6.23, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.76, \"dyaw\": 15.66, \"droll\": 0.0}, {\"dx\": 8.48, \"dy\": 0.17, \"dz\": -0.01, \"dpitch\": -1.39, \"dyaw\": 21.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.67, "window_alt_abs_m": 0.01, "target_px_mean_hist": 507.2, "cur_frame_id": 37, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.02, -56.6, 20.0, -47.91, 130.92, 0.0]\n  Target bbox: [599.0, 250.61, 633.07, 317.44]\n\nFrame 2:\n  Drone pose: [-52.02, -56.1, 20.0, -38.39, 126.45, 0.0]\n  Target bbox: [649.95, 407.97, 693.01, 480.66]\n\nFrame 3:\n  Drone pose: [-52.02, -55.6, 20.0, -38.39, 129.99, 0.0]\n  Target bbox: [605.33, 406.99, 649.57, 480.91]\n\nFrame 4:\n  Drone pose: [-52.02, -55.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.55, 326.29, 656.49, 393.06]\n\nFrame 5 (current):\n  Drone pose: [-52.02, -54.6, 20.0, -39.06, 130.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 597.67, \"ymin\": 395.17, \"xmax\": 643.05, \"ymax\": 470.32}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -4.33, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -4.33, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -4.33, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": -4.33, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.5, \"dz\": 0.0, \"dpitch\": -4.33, \"dyaw\": -1.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.59, "window_alt_abs_m": 0.0, "target_px_mean_hist": 545.2, "cur_frame_id": 53, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.52, -48.1, 20.0, -43.39, 128.99, 0.0]\n  Target bbox: [623.69, 325.47, 655.97, 393.85]\n\nFrame 2:\n  Drone pose: [-49.02, -47.6, 20.0, -39.75, 132.25, 0.0]\n  Target bbox: [577.2, 385.24, 622.37, 458.23]\n\nFrame 3:\n  Drone pose: [-48.47, -46.91, 20.0, -38.78, 129.74, 0.0]\n  Target bbox: [572.12, 335.49, 604.26, 396.3]\n\nFrame 4:\n  Drone pose: [-48.02, -46.6, 20.0, -45.17, 123.99, 0.0]\n  Target bbox: [683.6, 297.18, 718.78, 366.1]\n\nFrame 5 (current):\n  Drone pose: [-47.52, -46.1, 20.0, -43.39, 128.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.49, \"ymin\": 327.51, \"xmax\": 651.28, \"ymax\": 391.82}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 1.42, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 1.83, \"dz\": 0.0, \"dpitch\": -0.55, \"dyaw\": -2.2, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": 2.25, \"dz\": 0.0, \"dpitch\": -0.81, \"dyaw\": -3.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.51, "window_alt_abs_m": 0.01, "target_px_mean_hist": 540.2, "cur_frame_id": 70, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.62, -40.52, 20.0, -46.16, 115.88, 0.0]\n  Target bbox: [620.87, 322.69, 658.84, 396.54]\n\nFrame 2:\n  Drone pose: [-45.53, -40.11, 20.0, -46.32, 114.59, 0.0]\n  Target bbox: [622.46, 325.25, 657.69, 393.92]\n\nFrame 3:\n  Drone pose: [-45.45, -39.69, 20.0, -48.75, 118.46, 0.0]\n  Target bbox: [580.19, 285.11, 611.59, 348.37]\n\nFrame 4:\n  Drone pose: [-45.37, -39.17, 19.85, -36.02, 113.54, 0.0]\n  Target bbox: [668.94, 398.49, 701.62, 466.09]\n\nFrame 5 (current):\n  Drone pose: [-45.38, -38.88, 20.0, -48.74, 115.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 607.05, \"ymin\": 277.08, \"xmax\": 645.18, \"ymax\": 344.65}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 3.03, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 0.78, \"dz\": 0.0, \"dpitch\": 3.1, \"dyaw\": -1.96, \"droll\": 0.0}, {\"dx\": -0.55, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": 3.1, \"dyaw\": -3.17, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": 1.47, \"dz\": 0.0, \"dpitch\": 3.12, \"dyaw\": -4.38, \"droll\": 0.0}, {\"dx\": -1.27, \"dy\": 1.81, \"dz\": 0.0, \"dpitch\": 3.14, \"dyaw\": -5.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.38, "window_alt_abs_m": 0.29, "target_px_mean_hist": 581.0, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00103/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.19, -34.67, 20.0, -46.74, 93.92, 0.0]\n  Target bbox: [677.84, 300.67, 721.01, 375.13]\n\nFrame 2:\n  Drone pose: [-49.55, -34.32, 20.0, -45.27, 96.26, 0.0]\n  Target bbox: [618.69, 322.71, 661.22, 396.7]\n\nFrame 3:\n  Drone pose: [-49.91, -33.98, 20.0, -47.06, 91.36, 0.0]\n  Target bbox: [644.07, 293.0, 690.29, 362.86]\n\nFrame 4:\n  Drone pose: [-50.42, -33.7, 19.93, -48.24, 85.81, 0.0]\n  Target bbox: [672.1, 312.43, 723.0, 382.41]\n\nFrame 5 (current):\n  Drone pose: [-50.64, -33.29, 20.0, -44.72, 88.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.82, \"ymin\": 323.46, \"xmax\": 660.35, \"ymax\": 395.83}, \"waypoint_deltas\": [{\"dx\": -0.36, \"dy\": 0.34, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 0.74, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": -4.26, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": 0.75, \"dyaw\": -5.97, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": 1.55, \"dz\": 0.0, \"dpitch\": 1.01, \"dyaw\": -7.64, \"droll\": 0.0}, {\"dx\": -0.76, \"dy\": 1.96, \"dz\": 0.0, \"dpitch\": 1.29, \"dyaw\": -9.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.42, "window_alt_abs_m": 0.14, "target_px_mean_hist": 580.8, "cur_frame_id": 103, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00118/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00119/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00120/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.21, -28.09, 20.0, -42.17, 77.38, 0.0]\n  Target bbox: [628.77, 328.57, 651.42, 390.77]\n\nFrame 2:\n  Drone pose: [-52.32, -27.68, 20.01, -40.03, 80.42, 0.0]\n  Target bbox: [642.27, 361.24, 677.79, 430.94]\n\nFrame 3:\n  Drone pose: [-52.42, -27.19, 20.0, -39.57, 84.28, 0.0]\n  Target bbox: [558.48, 348.73, 593.77, 410.31]\n\nFrame 4:\n  Drone pose: [-52.43, -26.82, 19.98, -46.47, 80.3, 0.0]\n  Target bbox: [618.54, 324.46, 661.66, 394.68]\n\nFrame 5 (current):\n  Drone pose: [-52.61, -26.47, 20.0, -42.21, 81.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 555.69, \"ymin\": 318.84, \"xmax\": 597.64, \"ymax\": 382.05}, \"waypoint_deltas\": [{\"dx\": -0.1, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.81, \"dyaw\": -5.2, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": 0.97, \"dyaw\": -5.4, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": 1.12, \"dyaw\": -5.6, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": 1.62, \"dz\": 0.0, \"dpitch\": 1.27, \"dyaw\": -5.79, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": 2.02, \"dz\": 0.0, \"dpitch\": 1.42, \"dyaw\": -5.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.15, "window_alt_abs_m": 0.07, "target_px_mean_hist": 562.2, "cur_frame_id": 120, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00132/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00135/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00136/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.82, -21.61, 20.0, -38.87, 77.08, 0.0]\n  Target bbox: [580.88, 342.62, 625.54, 407.66]\n\nFrame 2:\n  Drone pose: [-53.92, -21.21, 20.0, -39.61, 74.09, 0.0]\n  Target bbox: [625.21, 327.76, 654.95, 391.85]\n\nFrame 3:\n  Drone pose: [-53.95, -20.88, 20.09, -33.82, 74.24, 0.0]\n  Target bbox: [588.03, 400.99, 618.83, 462.32]\n\nFrame 4:\n  Drone pose: [-53.92, -20.33, 20.0, -36.9, 69.97, 0.0]\n  Target bbox: [676.08, 373.21, 713.36, 434.92]\n\nFrame 5 (current):\n  Drone pose: [-53.83, -19.9, 20.06, -44.99, 72.26, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.72, \"ymin\": 327.76, \"xmax\": 656.09, \"ymax\": 391.45}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": 0.52, \"dz\": -0.06, \"dpitch\": 5.53, \"dyaw\": 2.39, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 0.99, \"dz\": -0.06, \"dpitch\": 5.53, \"dyaw\": 2.61, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 1.47, \"dz\": -0.06, \"dpitch\": 5.53, \"dyaw\": 2.83, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.95, \"dz\": -0.06, \"dpitch\": 5.53, \"dyaw\": 3.04, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 2.43, \"dz\": -0.06, \"dpitch\": 5.53, \"dyaw\": 3.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.71, "window_alt_abs_m": 0.23, "target_px_mean_hist": 499.8, "cur_frame_id": 136, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00149/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00150/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00151/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00152/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840/aug_001/frames_playback/frame_00153/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-52.89, -13.61, 20.0, -39.32, 79.19, 0.0]\n  Target bbox: [588.7, 332.16, 628.83, 392.2]\n\nFrame 2:\n  Drone pose: [-52.84, -13.22, 19.96, -33.03, 77.45, 0.0]\n  Target bbox: [622.54, 357.79, 653.69, 409.69]\n\nFrame 3:\n  Drone pose: [-52.79, -12.64, 20.0, -39.44, 77.08, 0.0]\n  Target bbox: [621.03, 327.91, 658.75, 391.61]\n\nFrame 4:\n  Drone pose: [-52.74, -12.15, 20.0, -39.44, 77.2, 0.0]\n  Target bbox: [623.1, 328.37, 656.99, 391.24]\n\nFrame 5 (current):\n  Drone pose: [-52.79, -11.58, 20.01, -41.27, 80.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 651.04, \"ymin\": 278.57, \"xmax\": 689.08, \"ymax\": 345.88}, \"waypoint_deltas\": [{\"dx\": 0.13, \"dy\": 0.41, \"dz\": -0.01, \"dpitch\": 1.84, \"dyaw\": -3.27, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 0.9, \"dz\": -0.01, \"dpitch\": 1.84, \"dyaw\": -3.19, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": 1.39, \"dz\": -0.01, \"dpitch\": 1.84, \"dyaw\": -3.11, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 1.88, \"dz\": -0.01, \"dpitch\": 1.85, \"dyaw\": -3.04, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 2.37, \"dz\": -0.01, \"dpitch\": 1.85, \"dyaw\": -2.99, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.71, "window_alt_abs_m": 0.08, "target_px_mean_hist": 465.8, "cur_frame_id": 153, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-14/trajectory_1776128840", "difficulty_score": 0.3846, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.87, 13.94, 22.0, -46.42, -177.14, 0.0]\n  Target bbox: [621.92, 325.57, 658.09, 393.69]\n\nFrame 2:\n  Drone pose: [24.31, 12.7, 21.2, -46.9, -179.22, 0.0]\n  Target bbox: [626.71, 328.38, 653.54, 390.72]\n\nFrame 3:\n  Drone pose: [23.33, 12.11, 20.67, -46.87, 178.95, 0.0]\n  Target bbox: [625.97, 322.02, 653.61, 397.28]\n\nFrame 4:\n  Drone pose: [22.66, 11.86, 20.64, -47.08, 178.17, 0.0]\n  Target bbox: [625.21, 328.6, 654.57, 390.32]\n\nFrame 5 (current):\n  Drone pose: [22.15, 11.76, 20.62, -47.05, 177.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.81, \"ymin\": 323.23, \"xmax\": 653.83, \"ymax\": 395.87}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.06, \"dz\": -0.03, \"dpitch\": 0.1, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": -0.12, \"dz\": -0.05, \"dpitch\": 0.19, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": -1.4, \"dy\": -0.18, \"dz\": -0.07, \"dpitch\": 0.26, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": -0.24, \"dz\": -0.09, \"dpitch\": 0.31, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": -0.31, \"dz\": -0.2, \"dpitch\": 0.47, \"dyaw\": -0.97, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.0, "window_alt_abs_m": 1.38, "target_px_mean_hist": 555.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [15.46, 11.26, 20.19, -46.69, 176.26, 0.0]\n  Target bbox: [624.23, 321.1, 655.32, 398.13]\n\nFrame 2:\n  Drone pose: [14.94, 11.31, 20.17, -46.69, 176.42, 0.0]\n  Target bbox: [624.99, 324.69, 654.68, 394.38]\n\nFrame 3:\n  Drone pose: [14.44, 11.38, 20.15, -46.68, 176.62, 0.0]\n  Target bbox: [625.24, 322.46, 654.38, 396.69]\n\nFrame 4:\n  Drone pose: [13.94, 11.46, 20.14, -46.66, 176.89, 0.0]\n  Target bbox: [625.65, 322.21, 653.98, 396.9]\n\nFrame 5 (current):\n  Drone pose: [13.45, 11.57, 20.12, -46.63, 177.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.48, \"ymin\": 321.44, \"xmax\": 654.1, \"ymax\": 397.78}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.12, \"dz\": -0.02, \"dpitch\": 0.04, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 0.26, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": 0.83, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.4, \"dz\": -0.04, \"dpitch\": 0.08, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": 0.54, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 1.71, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": 0.67, \"dz\": -0.06, \"dpitch\": -0.05, \"dyaw\": 2.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.96, "window_alt_abs_m": 0.07, "target_px_mean_hist": 614.8, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.21, 12.95, 20.01, -48.32, -173.19, 0.0]\n  Target bbox: [620.49, 323.26, 659.55, 395.72]\n\nFrame 2:\n  Drone pose: [4.57, 12.82, 20.01, -48.47, -171.88, 0.0]\n  Target bbox: [615.66, 318.52, 664.27, 400.53]\n\nFrame 3:\n  Drone pose: [3.94, 12.66, 20.01, -48.6, -170.68, 0.0]\n  Target bbox: [620.29, 321.81, 659.63, 397.1]\n\nFrame 4:\n  Drone pose: [3.31, 12.47, 20.01, -48.73, -169.55, 0.0]\n  Target bbox: [615.41, 318.84, 664.45, 400.28]\n\nFrame 5 (current):\n  Drone pose: [2.66, 12.27, 20.01, -48.88, -168.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.96, \"ymin\": 318.46, \"xmax\": 664.86, \"ymax\": 400.67}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.2, \"dz\": -0.01, \"dpitch\": -0.13, \"dyaw\": 1.12, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.41, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": 2.21, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": -0.64, \"dz\": -0.01, \"dpitch\": -0.14, \"dyaw\": 3.25, \"droll\": 0.0}, {\"dx\": -2.52, \"dy\": -0.88, \"dz\": -0.01, \"dpitch\": -0.18, \"dyaw\": 4.21, \"droll\": 0.0}, {\"dx\": -3.08, \"dy\": -1.15, \"dz\": -0.01, \"dpitch\": -0.17, \"dyaw\": 5.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.75, "window_alt_abs_m": 0.01, "target_px_mean_hist": 429.8, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-4.3, 8.02, 20.0, -48.37, -160.56, 0.0]\n  Target bbox: [622.19, 321.75, 657.59, 397.16]\n\nFrame 2:\n  Drone pose: [-4.76, 7.62, 20.0, -48.26, -160.3, 0.0]\n  Target bbox: [618.64, 321.1, 661.15, 397.91]\n\nFrame 3:\n  Drone pose: [-5.21, 7.25, 20.0, -48.1, -159.93, 0.0]\n  Target bbox: [620.34, 319.04, 660.06, 400.09]\n\nFrame 4:\n  Drone pose: [-5.65, 6.93, 20.0, -48.19, -161.03, 0.0]\n  Target bbox: [620.76, 318.91, 659.63, 400.15]\n\nFrame 5 (current):\n  Drone pose: [-6.09, 6.65, 20.0, -48.24, -162.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.97, \"ymin\": 317.58, \"xmax\": 661.53, \"ymax\": 401.62}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": -0.26, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": -1.29, \"dy\": -0.7, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -0.89, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -3.16, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -3.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 641.8, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-12.41, 4.81, 20.0, -47.92, -168.61, 0.0]\n  Target bbox: [623.05, 322.17, 657.28, 396.84]\n\nFrame 2:\n  Drone pose: [-12.91, 4.86, 20.0, -47.9, -168.42, 0.0]\n  Target bbox: [622.8, 322.04, 657.52, 396.86]\n\nFrame 3:\n  Drone pose: [-13.42, 4.95, 20.0, -47.89, -168.14, 0.0]\n  Target bbox: [623.05, 324.45, 657.18, 394.39]\n\nFrame 4:\n  Drone pose: [-13.95, 5.06, 20.0, -47.89, -167.77, 0.0]\n  Target bbox: [621.14, 318.73, 659.34, 400.41]\n\nFrame 5 (current):\n  Drone pose: [-14.48, 5.18, 20.0, -47.91, -167.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.6, \"ymin\": 318.89, \"xmax\": 659.87, \"ymax\": 400.22}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 0.24, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.87, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": 0.33, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 1.2, \"droll\": 0.0}, {\"dx\": -2.23, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.35, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": 1.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.27, "window_alt_abs_m": 0.0, "target_px_mean_hist": 652.8, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-22.9, 3.65, 20.0, -48.84, -156.19, 0.0]\n  Target bbox: [621.53, 325.06, 658.35, 393.81]\n\nFrame 2:\n  Drone pose: [-23.55, 3.38, 20.0, -48.91, -155.27, 0.0]\n  Target bbox: [617.93, 320.77, 661.79, 398.21]\n\nFrame 3:\n  Drone pose: [-24.16, 3.11, 20.0, -48.91, -154.38, 0.0]\n  Target bbox: [616.28, 319.31, 663.33, 399.8]\n\nFrame 4:\n  Drone pose: [-24.79, 2.77, 20.0, -48.99, -153.68, 0.0]\n  Target bbox: [621.23, 324.61, 658.6, 394.27]\n\nFrame 5 (current):\n  Drone pose: [-25.4, 2.44, 20.0, -49.04, -152.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.63, \"ymin\": 323.43, \"xmax\": 659.15, \"ymax\": 395.46}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.65, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.66, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 1.97, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": -1.33, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 2.62, \"droll\": 0.0}, {\"dx\": -2.94, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 3.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 488.0, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-33.12, -2.04, 20.0, -49.04, -144.78, 0.0]\n  Target bbox: [619.06, 322.38, 660.6, 396.43]\n\nFrame 2:\n  Drone pose: [-33.75, -2.38, 20.0, -49.06, -144.09, 0.0]\n  Target bbox: [619.25, 322.17, 660.39, 396.66]\n\nFrame 3:\n  Drone pose: [-34.38, -2.72, 20.0, -49.07, -143.36, 0.0]\n  Target bbox: [619.64, 321.99, 659.98, 396.85]\n\nFrame 4:\n  Drone pose: [-35.02, -3.05, 20.0, -49.1, -142.59, 0.0]\n  Target bbox: [620.13, 324.79, 659.58, 394.01]\n\nFrame 5 (current):\n  Drone pose: [-35.68, -3.37, 20.0, -49.12, -141.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.0, \"ymin\": 322.29, \"xmax\": 659.61, \"ymax\": 396.54}, \"waypoint_deltas\": [{\"dx\": -0.67, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.85, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.73, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -0.96, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 2.64, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": -1.27, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 3.54, \"droll\": 0.0}, {\"dx\": -3.37, \"dy\": -1.59, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 4.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 641.0, "cur_frame_id": 109, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.54, -8.0, 20.0, -49.21, -130.09, 0.0]\n  Target bbox: [620.0, 321.99, 659.93, 396.96]\n\nFrame 2:\n  Drone pose: [-42.67, -8.39, 20.0, -49.19, -129.48, 0.0]\n  Target bbox: [614.9, 318.79, 665.18, 400.26]\n\nFrame 3:\n  Drone pose: [-42.79, -8.78, 20.0, -49.18, -128.94, 0.0]\n  Target bbox: [614.9, 318.22, 665.12, 400.8]\n\nFrame 4:\n  Drone pose: [-42.91, -9.19, 20.0, -49.2, -128.4, 0.0]\n  Target bbox: [614.8, 318.37, 665.28, 400.61]\n\nFrame 5 (current):\n  Drone pose: [-43.06, -9.61, 20.0, -49.25, -127.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.45, \"ymin\": 318.2, \"xmax\": 666.64, \"ymax\": 400.89}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.66, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -0.9, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": 1.44, \"droll\": 0.0}, {\"dx\": -0.73, \"dy\": -1.37, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 0.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.26, "window_alt_abs_m": 0.0, "target_px_mean_hist": 644.0, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.25, -15.95, 20.0, -48.46, -128.32, 0.0]\n  Target bbox: [620.72, 321.04, 659.74, 397.97]\n\nFrame 2:\n  Drone pose: [-48.77, -16.42, 20.0, -48.45, -128.2, 0.0]\n  Target bbox: [620.17, 322.57, 660.21, 396.26]\n\nFrame 3:\n  Drone pose: [-49.3, -16.88, 20.0, -48.43, -128.03, 0.0]\n  Target bbox: [623.33, 324.42, 656.95, 394.37]\n\nFrame 4:\n  Drone pose: [-49.85, -17.33, 20.0, -48.42, -127.79, 0.0]\n  Target bbox: [625.66, 323.86, 654.61, 394.97]\n\nFrame 5 (current):\n  Drone pose: [-50.42, -17.78, 20.0, -48.41, -127.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.29, \"ymin\": 324.97, \"xmax\": 658.0, \"ymax\": 393.92}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": -0.87, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -1.3, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -2.3, \"dy\": -1.73, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 1.37, \"droll\": 0.0}, {\"dx\": -2.84, \"dy\": -2.16, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 1.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 638.2, "cur_frame_id": 144, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/ORI/frames_playback/frame_00162/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.59, -25.61, 20.0, -48.61, -131.51, 0.0]\n  Target bbox: [620.61, 322.33, 659.33, 396.63]\n\nFrame 2:\n  Drone pose: [-53.42, -26.34, 20.0, -48.7, -132.49, 0.0]\n  Target bbox: [617.85, 320.32, 662.16, 398.7]\n\nFrame 3:\n  Drone pose: [-53.24, -27.07, 20.0, -48.79, -133.48, 0.0]\n  Target bbox: [614.98, 317.62, 665.17, 401.4]\n\nFrame 4:\n  Drone pose: [-53.07, -27.8, 20.0, -48.86, -134.47, 0.0]\n  Target bbox: [616.31, 318.64, 663.82, 400.36]\n\nFrame 5 (current):\n  Drone pose: [-52.89, -28.53, 20.0, -48.93, -135.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.57, \"ymin\": 317.1, \"xmax\": 666.66, \"ymax\": 402.08}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.73, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.45, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -2.0, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -1.97, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -2.11, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": -2.49, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -2.21, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": -3.0, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.95, "window_alt_abs_m": 0.0, "target_px_mean_hist": 655.2, "cur_frame_id": 162, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.87, 13.94, 22.0, -46.42, -177.14, 0.0]\n  Target bbox: [621.99, 325.47, 657.92, 393.81]\n\nFrame 2:\n  Drone pose: [24.23, 12.55, 21.19, -50.83, -174.67, 0.0]\n  Target bbox: [567.48, 262.05, 597.45, 331.91]\n\nFrame 3:\n  Drone pose: [23.33, 12.11, 20.67, -44.07, 173.95, 0.0]\n  Target bbox: [682.06, 370.82, 713.0, 446.06]\n\nFrame 4:\n  Drone pose: [22.68, 11.96, 20.51, -50.85, 178.78, 0.0]\n  Target bbox: [622.32, 261.18, 650.16, 323.68]\n\nFrame 5 (current):\n  Drone pose: [22.15, 11.76, 20.62, -46.85, 179.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 605.98, \"ymin\": 332.1, \"xmax\": 634.15, \"ymax\": 393.95}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.06, \"dz\": -0.03, \"dpitch\": -0.1, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": -0.12, \"dz\": -0.05, \"dpitch\": -0.01, \"dyaw\": -2.06, \"droll\": 0.0}, {\"dx\": -1.4, \"dy\": -0.18, \"dz\": -0.07, \"dpitch\": 0.06, \"dyaw\": -2.25, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": -0.24, \"dz\": -0.09, \"dpitch\": 0.11, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": -2.4, \"dy\": -0.31, \"dz\": -0.2, \"dpitch\": 0.27, \"dyaw\": -2.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.46, "window_alt_abs_m": 1.59, "target_px_mean_hist": 566.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [15.33, 11.23, 20.17, -46.86, 176.13, 0.0]\n  Target bbox: [625.48, 321.6, 654.13, 397.53]\n\nFrame 2:\n  Drone pose: [14.87, 11.17, 20.13, -44.59, 173.19, 0.0]\n  Target bbox: [655.84, 362.93, 688.18, 428.73]\n\nFrame 3:\n  Drone pose: [14.46, 11.5, 20.11, -46.59, 177.02, 0.0]\n  Target bbox: [625.21, 325.94, 654.5, 393.1]\n\nFrame 4:\n  Drone pose: [13.91, 11.48, 19.97, -48.62, 176.93, 0.0]\n  Target bbox: [625.87, 287.63, 654.2, 358.51]\n\nFrame 5 (current):\n  Drone pose: [13.32, 11.65, 20.08, -46.0, 177.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.36, \"ymin\": 334.35, \"xmax\": 651.53, \"ymax\": 411.09}, \"waypoint_deltas\": [{\"dx\": -0.36, \"dy\": 0.04, \"dz\": 0.02, \"dpitch\": -0.59, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": 0.18, \"dz\": 0.01, \"dpitch\": -0.55, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.55, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -1.88, \"dy\": 0.46, \"dz\": -0.01, \"dpitch\": -0.59, \"dyaw\": 1.25, \"droll\": 0.0}, {\"dx\": -2.44, \"dy\": 0.59, \"dz\": -0.02, \"dpitch\": -0.68, \"dyaw\": 1.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.59, "window_alt_abs_m": 0.32, "target_px_mean_hist": 619.8, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.3, 13.01, 20.01, -48.16, -173.04, 0.0]\n  Target bbox: [617.39, 320.46, 662.63, 398.58]\n\nFrame 2:\n  Drone pose: [4.57, 12.82, 20.01, -50.67, -176.88, 0.0]\n  Target bbox: [669.44, 282.7, 723.19, 366.29]\n\nFrame 3:\n  Drone pose: [3.79, 12.59, 20.06, -48.95, -170.85, 0.0]\n  Target bbox: [616.55, 319.34, 663.36, 399.67]\n\nFrame 4:\n  Drone pose: [3.4, 12.35, 20.06, -53.69, -167.06, 0.0]\n  Target bbox: [586.92, 238.31, 626.12, 313.61]\n\nFrame 5 (current):\n  Drone pose: [2.8, 12.36, 20.0, -53.6, -169.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.24, \"ymin\": 234.82, \"xmax\": 678.61, \"ymax\": 316.23}, \"waypoint_deltas\": [{\"dx\": -0.79, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 4.59, \"dyaw\": 2.19, \"droll\": 0.0}, {\"dx\": -1.44, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 4.67, \"dyaw\": 3.28, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.73, \"dz\": 0.0, \"dpitch\": 4.58, \"dyaw\": 4.32, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 4.54, \"dyaw\": 5.28, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": -1.24, \"dz\": 0.0, \"dpitch\": 4.55, \"dyaw\": 6.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.11, "window_alt_abs_m": 0.11, "target_px_mean_hist": 439.0, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-4.36, 7.98, 20.1, -48.65, -160.62, 0.0]\n  Target bbox: [615.58, 318.48, 664.09, 400.67]\n\nFrame 2:\n  Drone pose: [-4.76, 7.62, 20.0, -49.16, -160.86, 0.0]\n  Target bbox: [622.68, 303.33, 669.86, 385.46]\n\nFrame 3:\n  Drone pose: [-5.15, 7.22, 20.17, -48.28, -160.1, 0.0]\n  Target bbox: [620.8, 319.7, 659.56, 399.34]\n\nFrame 4:\n  Drone pose: [-5.68, 7.1, 20.1, -48.29, -160.47, 0.0]\n  Target bbox: [619.0, 318.07, 661.44, 401.08]\n\nFrame 5 (current):\n  Drone pose: [-6.09, 6.65, 20.0, -51.02, -167.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 675.53, \"ymin\": 273.2, \"xmax\": 717.93, \"ymax\": 356.23}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": -0.26, \"dz\": 0.0, \"dpitch\": 2.77, \"dyaw\": 4.11, \"droll\": 0.0}, {\"dx\": -0.86, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 2.77, \"dyaw\": 3.3, \"droll\": 0.0}, {\"dx\": -1.29, \"dy\": -0.7, \"dz\": 0.0, \"dpitch\": 2.78, \"dyaw\": 2.55, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -0.89, \"dz\": 0.0, \"dpitch\": 2.8, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": -1.08, \"dz\": 0.0, \"dpitch\": 2.82, \"dyaw\": 1.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.91, "window_alt_abs_m": 0.44, "target_px_mean_hist": 640.0, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-12.44, 4.75, 19.94, -47.91, -168.77, 0.0]\n  Target bbox: [621.72, 318.24, 658.79, 400.98]\n\nFrame 2:\n  Drone pose: [-12.96, 4.74, 20.12, -49.13, -173.81, 0.0]\n  Target bbox: [678.8, 305.94, 714.38, 385.41]\n\nFrame 3:\n  Drone pose: [-13.31, 5.09, 19.98, -47.63, -167.74, 0.0]\n  Target bbox: [622.52, 318.85, 657.91, 400.22]\n\nFrame 4:\n  Drone pose: [-13.95, 5.06, 20.0, -47.51, -172.77, 0.0]\n  Target bbox: [679.07, 333.65, 714.54, 401.59]\n\nFrame 5 (current):\n  Drone pose: [-14.62, 5.2, 20.05, -48.21, -167.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.92, \"ymin\": 320.25, \"xmax\": 658.47, \"ymax\": 398.77}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": 0.11, \"dz\": -0.05, \"dpitch\": 0.28, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": 0.22, \"dz\": -0.05, \"dpitch\": 0.24, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 0.31, \"dz\": -0.05, \"dpitch\": 0.17, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 0.35, \"dz\": -0.05, \"dpitch\": 0.07, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": 0.33, \"dz\": -0.05, \"dpitch\": -0.08, \"dyaw\": 1.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.75, "window_alt_abs_m": 0.38, "target_px_mean_hist": 637.0, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-22.9, 3.65, 20.0, -51.47, -160.29, 0.0]\n  Target bbox: [660.7, 275.57, 710.81, 357.6]\n\nFrame 2:\n  Drone pose: [-23.66, 3.39, 19.92, -53.79, -153.8, 0.0]\n  Target bbox: [607.29, 241.13, 643.57, 315.36]\n\nFrame 3:\n  Drone pose: [-24.12, 3.05, 20.06, -48.98, -154.64, 0.0]\n  Target bbox: [617.76, 320.58, 661.95, 398.37]\n\nFrame 4:\n  Drone pose: [-24.83, 2.75, 19.81, -48.79, -153.69, 0.0]\n  Target bbox: [616.65, 318.87, 662.95, 400.22]\n\nFrame 5 (current):\n  Drone pose: [-25.4, 2.44, 20.0, -52.38, -157.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 676.08, \"ymin\": 266.23, \"xmax\": 714.86, \"ymax\": 343.95}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": 3.36, \"dyaw\": 5.65, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.66, \"dz\": 0.0, \"dpitch\": 3.36, \"dyaw\": 6.31, \"droll\": 0.0}, {\"dx\": -1.76, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 3.36, \"dyaw\": 6.97, \"droll\": 0.0}, {\"dx\": -2.35, \"dy\": -1.33, \"dz\": 0.0, \"dpitch\": 3.36, \"dyaw\": 7.62, \"droll\": 0.0}, {\"dx\": -2.94, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": 3.36, \"dyaw\": 8.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.56, "window_alt_abs_m": 0.65, "target_px_mean_hist": 492.8, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-33.06, -2.0, 19.98, -48.88, -144.81, 0.0]\n  Target bbox: [621.81, 323.65, 657.89, 395.23]\n\nFrame 2:\n  Drone pose: [-33.79, -2.33, 20.17, -47.2, -148.87, 0.0]\n  Target bbox: [673.08, 358.99, 717.11, 434.74]\n\nFrame 3:\n  Drone pose: [-34.38, -2.72, 20.0, -49.07, -143.36, 0.0]\n  Target bbox: [619.42, 321.91, 660.19, 396.93]\n\nFrame 4:\n  Drone pose: [-35.02, -3.05, 20.0, -49.1, -142.59, 0.0]\n  Target bbox: [622.7, 326.17, 657.06, 392.65]\n\nFrame 5 (current):\n  Drone pose: [-35.63, -3.32, 20.02, -47.24, -146.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 670.23, \"ymin\": 355.31, \"xmax\": 712.38, \"ymax\": 426.75}, \"waypoint_deltas\": [{\"dx\": -0.72, \"dy\": -0.38, \"dz\": -0.02, \"dpitch\": -1.92, \"dyaw\": 5.44, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -0.69, \"dz\": -0.02, \"dpitch\": -1.95, \"dyaw\": 6.32, \"droll\": 0.0}, {\"dx\": -2.07, \"dy\": -1.01, \"dz\": -0.02, \"dpitch\": -1.98, \"dyaw\": 7.23, \"droll\": 0.0}, {\"dx\": -2.75, \"dy\": -1.32, \"dz\": -0.02, \"dpitch\": -2.01, \"dyaw\": 8.13, \"droll\": 0.0}, {\"dx\": -3.42, \"dy\": -1.64, \"dz\": -0.02, \"dpitch\": -2.01, \"dyaw\": 9.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.12, "window_alt_abs_m": 0.38, "target_px_mean_hist": 640.5, "cur_frame_id": 109, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.57, -7.85, 19.9, -48.89, -129.68, 0.0]\n  Target bbox: [612.78, 317.5, 667.34, 401.68]\n\nFrame 2:\n  Drone pose: [-42.65, -8.36, 20.02, -49.18, -129.48, 0.0]\n  Target bbox: [614.96, 318.18, 665.14, 400.79]\n\nFrame 3:\n  Drone pose: [-42.79, -8.78, 20.0, -49.18, -128.94, 0.0]\n  Target bbox: [613.38, 318.09, 666.74, 401.02]\n\nFrame 4:\n  Drone pose: [-42.91, -9.19, 20.0, -49.2, -128.4, 0.0]\n  Target bbox: [619.1, 320.77, 660.85, 398.11]\n\nFrame 5 (current):\n  Drone pose: [-43.02, -9.72, 19.94, -48.93, -133.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 669.42, \"ymin\": 325.1, \"xmax\": 721.6, \"ymax\": 408.84}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.33, \"dz\": 0.06, \"dpitch\": -0.43, \"dyaw\": 6.0, \"droll\": 0.0}, {\"dx\": -0.47, \"dy\": -0.79, \"dz\": 0.06, \"dpitch\": -0.62, \"dyaw\": 6.78, \"droll\": 0.0}, {\"dx\": -0.77, \"dy\": -1.26, \"dz\": 0.06, \"dpitch\": -0.37, \"dyaw\": 6.26, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -1.75, \"dz\": 0.06, \"dpitch\": -0.2, \"dyaw\": 5.86, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -2.24, \"dz\": 0.06, \"dpitch\": -0.07, \"dyaw\": 5.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.04, "window_alt_abs_m": 0.2, "target_px_mean_hist": 648.8, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.25, -15.95, 20.0, -48.46, -128.32, 0.0]\n  Target bbox: [623.11, 320.98, 657.29, 397.95]\n\nFrame 2:\n  Drone pose: [-48.69, -16.54, 19.98, -48.49, -128.69, 0.0]\n  Target bbox: [621.33, 326.02, 658.94, 392.8]\n\nFrame 3:\n  Drone pose: [-49.32, -16.89, 19.83, -45.93, -133.0, 0.0]\n  Target bbox: [681.69, 364.02, 711.23, 434.84]\n\nFrame 4:\n  Drone pose: [-49.85, -17.33, 20.0, -48.42, -127.79, 0.0]\n  Target bbox: [624.48, 324.55, 655.78, 394.25]\n\nFrame 5 (current):\n  Drone pose: [-50.35, -17.68, 20.05, -48.3, -127.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.71, \"ymin\": 325.33, \"xmax\": 659.57, \"ymax\": 393.52}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.54, \"dz\": -0.05, \"dpitch\": -0.12, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": -1.23, \"dy\": -0.97, \"dz\": -0.05, \"dpitch\": -0.11, \"dyaw\": 0.68, \"droll\": 0.0}, {\"dx\": -1.8, \"dy\": -1.4, \"dz\": -0.05, \"dpitch\": -0.08, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": -2.37, \"dy\": -1.83, \"dz\": -0.05, \"dpitch\": -0.05, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": -2.91, \"dy\": -2.26, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": 1.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.19, "window_alt_abs_m": 0.4, "target_px_mean_hist": 665.2, "cur_frame_id": 144, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734/aug_001/frames_playback/frame_00162/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.59, -25.61, 20.0, -43.61, -132.06, 0.0]\n  Target bbox: [619.58, 401.41, 673.11, 486.11]\n\nFrame 2:\n  Drone pose: [-53.37, -26.32, 20.02, -45.27, -137.56, 0.0]\n  Target bbox: [673.19, 379.09, 718.98, 457.57]\n\nFrame 3:\n  Drone pose: [-53.24, -27.07, 20.0, -48.79, -133.48, 0.0]\n  Target bbox: [619.95, 321.2, 659.98, 397.69]\n\nFrame 4:\n  Drone pose: [-53.09, -27.96, 20.07, -49.19, -134.8, 0.0]\n  Target bbox: [615.27, 317.8, 664.91, 401.21]\n\nFrame 5 (current):\n  Drone pose: [-52.96, -28.58, 20.17, -49.32, -135.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.26, \"ymin\": 322.4, \"xmax\": 659.72, \"ymax\": 396.54}, \"waypoint_deltas\": [{\"dx\": 0.24, \"dy\": -0.68, \"dz\": -0.17, \"dpitch\": 0.33, \"dyaw\": -1.02, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": -1.4, \"dz\": -0.17, \"dpitch\": 0.28, \"dyaw\": -2.02, \"droll\": 0.0}, {\"dx\": 0.44, \"dy\": -1.92, \"dz\": -0.17, \"dpitch\": 0.3, \"dyaw\": -2.13, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -2.44, \"dz\": -0.17, \"dpitch\": 0.32, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -2.95, \"dz\": -0.17, \"dpitch\": 0.33, \"dyaw\": -2.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.55, "window_alt_abs_m": 0.21, "target_px_mean_hist": 648.8, "cur_frame_id": 162, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_734", "difficulty_score": 0.2491, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.57, 39.22, 22.0, -46.48, -81.47, 0.0]\n  Target bbox: [619.13, 324.8, 660.76, 394.68]\n\nFrame 2:\n  Drone pose: [-103.96, 37.0, 21.2, -47.29, -74.96, 0.0]\n  Target bbox: [625.73, 328.45, 654.44, 390.7]\n\nFrame 3:\n  Drone pose: [-104.71, 35.52, 20.67, -47.67, -71.83, 0.0]\n  Target bbox: [621.59, 323.71, 658.18, 395.49]\n\nFrame 4:\n  Drone pose: [-105.06, 34.49, 20.64, -47.98, -68.68, 0.0]\n  Target bbox: [624.08, 325.26, 655.7, 393.91]\n\nFrame 5 (current):\n  Drone pose: [-105.09, 33.82, 20.62, -47.86, -66.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.57, \"ymin\": 323.9, \"xmax\": 663.54, \"ymax\": 395.26}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": -0.65, \"dz\": -0.03, \"dpitch\": -0.18, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": -1.28, \"dz\": -0.05, \"dpitch\": -0.34, \"dyaw\": 0.39, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.8, \"dz\": -0.07, \"dpitch\": -0.4, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -2.31, \"dz\": -0.09, \"dpitch\": -0.44, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -2.82, \"dz\": -0.2, \"dpitch\": -0.34, \"dyaw\": -0.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.56, "window_alt_abs_m": 1.38, "target_px_mean_hist": 542.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.53, 27.98, 20.24, -48.17, -68.17, 0.0]\n  Target bbox: [622.49, 325.6, 657.66, 393.44]\n\nFrame 2:\n  Drone pose: [-104.53, 27.47, 20.22, -48.15, -68.18, 0.0]\n  Target bbox: [623.08, 325.48, 657.08, 393.56]\n\nFrame 3:\n  Drone pose: [-104.54, 26.96, 20.19, -48.12, -68.12, 0.0]\n  Target bbox: [620.86, 325.19, 659.29, 393.88]\n\nFrame 4:\n  Drone pose: [-104.57, 26.45, 20.17, -48.1, -68.0, 0.0]\n  Target bbox: [616.01, 323.06, 664.12, 396.02]\n\nFrame 5 (current):\n  Drone pose: [-104.62, 25.93, 20.15, -48.07, -67.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.38, \"ymin\": 322.77, \"xmax\": 662.79, \"ymax\": 396.28}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.53, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -1.05, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -1.58, \"dz\": -0.05, \"dpitch\": 0.12, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": -2.12, \"dz\": -0.06, \"dpitch\": 0.16, \"dyaw\": 1.26, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": -2.65, \"dz\": -0.07, \"dpitch\": 0.2, \"dyaw\": 1.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.36, "window_alt_abs_m": 0.09, "target_px_mean_hist": 616.5, "cur_frame_id": 19, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.96, 19.53, 20.03, -47.91, -64.76, 0.0]\n  Target bbox: [622.51, 323.49, 657.22, 395.67]\n\nFrame 2:\n  Drone pose: [-104.71, 19.01, 20.02, -47.55, -63.98, 0.0]\n  Target bbox: [619.95, 322.51, 659.75, 396.66]\n\nFrame 3:\n  Drone pose: [-104.41, 18.49, 20.02, -47.42, -63.33, 0.0]\n  Target bbox: [622.52, 323.4, 657.2, 395.78]\n\nFrame 4:\n  Drone pose: [-104.11, 17.97, 20.02, -47.29, -62.75, 0.0]\n  Target bbox: [618.91, 322.7, 660.8, 396.47]\n\nFrame 5 (current):\n  Drone pose: [-103.8, 17.46, 20.01, -47.16, -62.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.2, \"ymin\": 324.05, \"xmax\": 655.54, \"ymax\": 395.1}, \"waypoint_deltas\": [{\"dx\": 0.29, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": 0.62, \"droll\": 0.0}, {\"dx\": 0.54, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": 1.33, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": -2.48, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -0.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.59, "window_alt_abs_m": 0.01, "target_px_mean_hist": 631.0, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.84, 11.58, 20.0, -47.67, -65.17, 0.0]\n  Target bbox: [614.68, 320.66, 665.44, 398.42]\n\nFrame 2:\n  Drone pose: [-101.72, 11.09, 20.0, -47.72, -65.55, 0.0]\n  Target bbox: [613.54, 320.34, 666.58, 398.75]\n\nFrame 3:\n  Drone pose: [-101.61, 10.61, 20.0, -47.77, -65.91, 0.0]\n  Target bbox: [615.79, 322.19, 664.32, 396.89]\n\nFrame 4:\n  Drone pose: [-101.52, 10.14, 20.0, -47.79, -66.24, 0.0]\n  Target bbox: [623.02, 325.42, 657.13, 393.64]\n\nFrame 5 (current):\n  Drone pose: [-101.44, 9.68, 20.0, -47.78, -66.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.61, \"ymin\": 322.34, \"xmax\": 664.5, \"ymax\": 396.75}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -0.88, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.49, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -1.32, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.75, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -2.18, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": -1.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 621.2, "cur_frame_id": 51, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.24, 4.13, 20.0, -47.22, -67.72, 0.0]\n  Target bbox: [620.57, 324.75, 659.59, 394.38]\n\nFrame 2:\n  Drone pose: [-101.29, 3.6, 20.0, -47.25, -67.52, 0.0]\n  Target bbox: [616.99, 323.12, 663.14, 395.97]\n\nFrame 3:\n  Drone pose: [-101.35, 3.06, 20.0, -47.27, -67.31, 0.0]\n  Target bbox: [616.47, 322.78, 663.66, 396.31]\n\nFrame 4:\n  Drone pose: [-101.4, 2.52, 20.0, -47.29, -67.1, 0.0]\n  Target bbox: [615.92, 322.8, 664.21, 396.34]\n\nFrame 5 (current):\n  Drone pose: [-101.45, 1.99, 20.0, -47.3, -66.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.01, \"ymin\": 324.48, \"xmax\": 659.14, \"ymax\": 394.59}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.54, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.81, "window_alt_abs_m": 0.0, "target_px_mean_hist": 569.0, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00083/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.57, -4.04, 20.02, -47.29, -66.49, 0.0]\n  Target bbox: [620.58, 324.8, 659.56, 394.31]\n\nFrame 2:\n  Drone pose: [-101.58, -4.54, 20.02, -47.29, -66.47, 0.0]\n  Target bbox: [615.45, 322.7, 664.67, 396.49]\n\nFrame 3:\n  Drone pose: [-101.57, -5.03, 20.02, -47.3, -66.5, 0.0]\n  Target bbox: [618.04, 322.79, 662.12, 396.31]\n\nFrame 4:\n  Drone pose: [-101.56, -5.53, 20.03, -47.3, -66.57, 0.0]\n  Target bbox: [616.43, 322.68, 663.69, 396.41]\n\nFrame 5 (current):\n  Drone pose: [-101.53, -6.02, 20.03, -47.32, -66.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.89, \"ymin\": 322.88, \"xmax\": 664.24, \"ymax\": 396.3}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.51, \"dz\": 0.01, \"dpitch\": -0.02, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.01, \"dz\": 0.01, \"dpitch\": -0.05, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.53, \"dz\": 0.02, \"dpitch\": -0.09, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -2.04, \"dz\": 0.03, \"dpitch\": -0.13, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -2.56, \"dz\": 0.03, \"dpitch\": -0.18, \"dyaw\": -0.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.19, "window_alt_abs_m": 0.02, "target_px_mean_hist": 619.2, "cur_frame_id": 83, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.2, -12.56, 20.15, -47.85, -63.89, 0.0]\n  Target bbox: [614.48, 320.61, 665.62, 398.48]\n\nFrame 2:\n  Drone pose: [-102.38, -13.2, 20.17, -47.95, -63.16, 0.0]\n  Target bbox: [615.97, 322.5, 664.1, 396.64]\n\nFrame 3:\n  Drone pose: [-102.53, -13.83, 20.19, -48.06, -62.51, 0.0]\n  Target bbox: [614.95, 320.65, 665.14, 398.44]\n\nFrame 4:\n  Drone pose: [-102.62, -14.43, 20.22, -48.18, -62.08, 0.0]\n  Target bbox: [621.71, 323.89, 658.04, 395.17]\n\nFrame 5 (current):\n  Drone pose: [-102.61, -15.0, 20.24, -47.92, -60.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.31, \"ymin\": 323.18, \"xmax\": 660.38, \"ymax\": 395.95}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.69, \"dz\": 0.02, \"dpitch\": 0.09, \"dyaw\": 1.71, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.45, \"dz\": 0.05, \"dpitch\": 0.17, \"dyaw\": 3.67, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.24, \"dz\": 0.07, \"dpitch\": 0.24, \"dyaw\": 5.64, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -3.8, \"dz\": 0.1, \"dpitch\": 0.01, \"dyaw\": 10.83, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": -5.97, \"dz\": 0.12, \"dpitch\": -0.07, \"dyaw\": 18.66, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.35, "window_alt_abs_m": 0.09, "target_px_mean_hist": 613.5, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-109.81, -34.98, 20.56, -41.86, -2.84, 0.0]\n  Target bbox: [619.04, 323.53, 660.98, 396.09]\n\nFrame 2:\n  Drone pose: [-109.61, -36.04, 20.57, -41.51, -1.34, 0.0]\n  Target bbox: [621.95, 326.28, 658.13, 393.31]\n\nFrame 3:\n  Drone pose: [-109.18, -36.78, 21.37, -42.58, -0.73, 0.0]\n  Target bbox: [620.77, 325.31, 659.22, 394.25]\n\nFrame 4:\n  Drone pose: [-108.74, -37.36, 20.58, -41.37, -0.52, 0.0]\n  Target bbox: [619.25, 324.47, 660.77, 395.23]\n\nFrame 5 (current):\n  Drone pose: [-108.3, -37.79, 20.58, -41.29, -0.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.96, \"ymin\": 327.22, \"xmax\": 650.76, \"ymax\": 392.31}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.3, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.77, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": -0.49, \"dz\": -0.01, \"dpitch\": 0.21, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": 1.25, \"dy\": -0.61, \"dz\": -0.02, \"dpitch\": 0.34, \"dyaw\": 1.57, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": -0.68, \"dz\": -0.02, \"dpitch\": 0.48, \"dyaw\": 1.75, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": -0.74, \"dz\": -0.03, \"dpitch\": 0.62, \"dyaw\": 1.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.49, "window_alt_abs_m": 1.6, "target_px_mean_hist": 540.5, "cur_frame_id": 115, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00131/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.44, -40.52, 20.44, -39.65, 2.36, 0.0]\n  Target bbox: [623.95, 328.04, 655.96, 391.59]\n\nFrame 2:\n  Drone pose: [-103.08, -40.95, 20.44, -39.49, 2.19, 0.0]\n  Target bbox: [622.29, 327.59, 657.75, 392.13]\n\nFrame 3:\n  Drone pose: [-102.72, -41.39, 20.45, -39.33, 2.03, 0.0]\n  Target bbox: [628.81, 326.89, 651.5, 392.82] (model-predicted box)\n\nFrame 4:\n  Drone pose: [-102.36, -41.83, 20.45, -39.77, 1.92, 0.0]\n  Target bbox: [627.96, 325.59, 652.39, 394.08]\n\nFrame 5 (current):\n  Drone pose: [-101.99, -42.27, 20.46, -40.23, 1.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.14, \"ymin\": 328.53, \"xmax\": 651.11, \"ymax\": 391.01}, \"waypoint_deltas\": [{\"dx\": 0.36, \"dy\": -0.44, \"dz\": 0.02, \"dpitch\": -0.47, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": -0.88, \"dz\": 0.03, \"dpitch\": -0.96, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -1.32, \"dz\": 0.05, \"dpitch\": -1.45, \"dyaw\": -0.39, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -1.76, \"dz\": 0.07, \"dpitch\": -1.3, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -2.2, \"dz\": 0.09, \"dpitch\": -1.16, \"dyaw\": -0.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 1, "current_invisible": false, "window_yaw_abs_deg": 0.56, "window_alt_abs_m": 0.02, "target_px_mean_hist": 283.2, "cur_frame_id": 131, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/ORI/frames_playback/frame_00147/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.09, -47.35, 20.75, -41.16, -0.52, 0.0]\n  Target bbox: [620.49, 325.44, 659.53, 394.23]\n\nFrame 2:\n  Drone pose: [-96.5, -47.66, 20.77, -41.31, -1.02, 0.0]\n  Target bbox: [619.38, 324.67, 660.65, 395.04]\n\nFrame 3:\n  Drone pose: [-95.91, -47.96, 20.8, -41.45, -1.53, 0.0]\n  Target bbox: [624.27, 328.41, 655.8, 391.14]\n\nFrame 4:\n  Drone pose: [-95.33, -48.26, 20.83, -41.59, -2.04, 0.0]\n  Target bbox: [620.04, 325.14, 659.98, 394.53]\n\nFrame 5 (current):\n  Drone pose: [-94.74, -48.57, 20.85, -41.73, -2.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.02, \"ymin\": 329.22, \"xmax\": 651.74, \"ymax\": 390.21}, \"waypoint_deltas\": [{\"dx\": 0.59, \"dy\": -0.3, \"dz\": 0.03, \"dpitch\": -0.16, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 1.17, \"dy\": -0.6, \"dz\": 0.05, \"dpitch\": -0.29, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": -0.91, \"dz\": 0.07, \"dpitch\": -0.45, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": 2.35, \"dy\": -1.21, \"dz\": 0.08, \"dpitch\": -0.57, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": 2.93, \"dy\": -1.52, \"dz\": 0.09, \"dpitch\": -0.71, \"dyaw\": 1.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.04, "window_alt_abs_m": 0.11, "target_px_mean_hist": 481.8, "cur_frame_id": 147, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.57, 39.22, 22.0, -46.48, -81.47, 0.0]\n  Target bbox: [622.45, 326.95, 657.44, 392.44]\n\nFrame 2:\n  Drone pose: [-103.85, 37.07, 21.11, -47.11, -75.34, 0.0]\n  Target bbox: [620.73, 327.28, 659.47, 391.89]\n\nFrame 3:\n  Drone pose: [-104.73, 35.69, 20.66, -46.57, -69.8, 0.0]\n  Target bbox: [597.26, 339.69, 633.22, 407.56]\n\nFrame 4:\n  Drone pose: [-105.06, 34.49, 20.64, -46.83, -66.46, 0.0]\n  Target bbox: [598.61, 345.14, 630.49, 413.36]\n\nFrame 5 (current):\n  Drone pose: [-105.06, 33.71, 20.46, -46.88, -71.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 675.84, \"ymin\": 343.26, \"xmax\": 717.96, \"ymax\": 410.9}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.54, \"dz\": 0.13, \"dpitch\": -1.16, \"dyaw\": 5.19, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.17, \"dz\": 0.11, \"dpitch\": -1.32, \"dyaw\": 5.33, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.69, \"dz\": 0.09, \"dpitch\": -1.38, \"dyaw\": 5.1, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -2.2, \"dz\": 0.07, \"dpitch\": -1.42, \"dyaw\": 4.85, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -2.71, \"dz\": -0.04, \"dpitch\": -1.32, \"dyaw\": 4.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.4, "window_alt_abs_m": 1.54, "target_px_mean_hist": 549.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.53, 27.98, 20.24, -48.27, -69.8, 0.0]\n  Target bbox: [633.29, 321.8, 684.25, 394.5]\n\nFrame 2:\n  Drone pose: [-104.5, 27.48, 20.14, -48.03, -68.27, 0.0]\n  Target bbox: [621.54, 324.79, 658.63, 394.26]\n\nFrame 3:\n  Drone pose: [-104.58, 26.88, 20.07, -48.03, -67.88, 0.0]\n  Target bbox: [614.09, 321.21, 666.07, 397.86]\n\nFrame 4:\n  Drone pose: [-104.57, 26.45, 20.17, -49.67, -72.5, 0.0]\n  Target bbox: [664.57, 297.28, 717.74, 371.71]\n\nFrame 5 (current):\n  Drone pose: [-104.58, 25.85, 20.29, -48.41, -67.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.47, \"ymin\": 322.42, \"xmax\": 663.69, \"ymax\": 396.61}, \"waypoint_deltas\": [{\"dx\": -0.11, \"dy\": -0.45, \"dz\": -0.15, \"dpitch\": 0.38, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -0.2, \"dy\": -0.97, \"dz\": -0.17, \"dpitch\": 0.42, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": -1.5, \"dz\": -0.19, \"dpitch\": 0.46, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -2.04, \"dz\": -0.2, \"dpitch\": 0.5, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": -2.57, \"dz\": -0.21, \"dpitch\": 0.54, \"dyaw\": 1.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.17, "window_alt_abs_m": 0.4, "target_px_mean_hist": 614.2, "cur_frame_id": 19, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.94, 19.68, 20.14, -52.81, -60.61, 0.0]\n  Target bbox: [568.78, 241.01, 610.51, 314.5]\n\nFrame 2:\n  Drone pose: [-104.71, 19.01, 20.02, -48.73, -68.14, 0.0]\n  Target bbox: [669.84, 304.74, 705.03, 377.15]\n\nFrame 3:\n  Drone pose: [-104.49, 18.56, 19.94, -42.13, -60.15, 0.0]\n  Target bbox: [586.85, 409.98, 622.18, 478.62]\n\nFrame 4:\n  Drone pose: [-104.11, 17.97, 20.02, -43.66, -57.75, 0.0]\n  Target bbox: [565.7, 386.83, 599.24, 457.97]\n\nFrame 5 (current):\n  Drone pose: [-103.73, 17.47, 19.96, -45.52, -63.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.28, \"ymin\": 347.7, \"xmax\": 669.4, \"ymax\": 425.44}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.52, \"dz\": 0.05, \"dpitch\": -1.48, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -1.02, \"dz\": 0.05, \"dpitch\": -1.29, \"dyaw\": 2.22, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -1.51, \"dz\": 0.05, \"dpitch\": -1.44, \"dyaw\": 1.6, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -2.0, \"dz\": 0.05, \"dpitch\": -1.57, \"dyaw\": 1.06, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -2.49, \"dz\": 0.04, \"dpitch\": -1.67, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.24, "window_alt_abs_m": 0.32, "target_px_mean_hist": 625.8, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.84, 11.58, 20.0, -50.51, -61.9, 0.0]\n  Target bbox: [583.25, 277.31, 621.94, 347.78]\n\nFrame 2:\n  Drone pose: [-101.72, 11.09, 20.0, -42.72, -65.72, 0.0]\n  Target bbox: [620.17, 407.35, 663.94, 479.98]\n\nFrame 3:\n  Drone pose: [-101.61, 10.61, 20.0, -46.25, -70.91, 0.0]\n  Target bbox: [671.89, 350.33, 722.29, 423.32]\n\nFrame 4:\n  Drone pose: [-101.61, 10.14, 19.85, -47.5, -65.98, 0.0]\n  Target bbox: [613.35, 320.22, 666.77, 398.88]\n\nFrame 5 (current):\n  Drone pose: [-101.45, 9.74, 19.89, -48.54, -63.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 582.37, \"ymin\": 308.97, \"xmax\": 618.41, \"ymax\": 377.81}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": -0.51, \"dz\": 0.11, \"dpitch\": 0.79, \"dyaw\": -3.69, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -0.94, \"dz\": 0.11, \"dpitch\": 0.86, \"dyaw\": -3.92, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -1.38, \"dz\": 0.11, \"dpitch\": 0.94, \"dyaw\": -4.12, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": -1.81, \"dz\": 0.11, \"dpitch\": 1.03, \"dyaw\": -4.29, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.24, \"dz\": 0.11, \"dpitch\": 1.11, \"dyaw\": -4.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.79, "window_alt_abs_m": 0.19, "target_px_mean_hist": 611.5, "cur_frame_id": 51, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.2, 4.12, 19.9, -51.06, -68.18, 0.0]\n  Target bbox: [627.33, 259.45, 660.98, 327.55]\n\nFrame 2:\n  Drone pose: [-101.17, 3.61, 19.97, -47.03, -68.83, 0.0]\n  Target bbox: [634.44, 329.05, 667.31, 397.56]\n\nFrame 3:\n  Drone pose: [-101.37, 3.03, 19.93, -47.35, -63.39, 0.0]\n  Target bbox: [573.02, 320.42, 619.16, 395.37]\n\nFrame 4:\n  Drone pose: [-101.34, 2.58, 19.92, -47.11, -67.35, 0.0]\n  Target bbox: [623.25, 325.61, 656.9, 393.48]\n\nFrame 5 (current):\n  Drone pose: [-101.45, 1.99, 20.0, -47.3, -66.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.76, \"ymin\": 323.97, \"xmax\": 661.37, \"ymax\": 395.13}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.04, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -2.54, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.5, "window_alt_abs_m": 0.2, "target_px_mean_hist": 603.0, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00083/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.57, -4.04, 20.02, -49.03, -71.49, 0.0]\n  Target bbox: [670.78, 294.36, 724.63, 369.88]\n\nFrame 2:\n  Drone pose: [-101.45, -4.64, 19.96, -43.68, -68.31, 0.0]\n  Target bbox: [634.87, 386.3, 681.8, 459.15]\n\nFrame 3:\n  Drone pose: [-101.44, -5.04, 19.92, -43.55, -69.83, 0.0]\n  Target bbox: [648.27, 383.55, 700.17, 460.43]\n\nFrame 4:\n  Drone pose: [-101.59, -5.66, 19.94, -45.63, -68.84, 0.0]\n  Target bbox: [649.76, 354.03, 688.85, 423.53]\n\nFrame 5 (current):\n  Drone pose: [-101.53, -6.2, 20.07, -47.65, -66.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.18, \"ymin\": 325.38, \"xmax\": 657.97, \"ymax\": 393.7}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.33, \"dz\": -0.03, \"dpitch\": 0.31, \"dyaw\": -0.28, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -0.83, \"dz\": -0.03, \"dpitch\": 0.28, \"dyaw\": -0.31, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.35, \"dz\": -0.02, \"dpitch\": 0.24, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.86, \"dz\": -0.01, \"dpitch\": 0.2, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -2.38, \"dz\": -0.01, \"dpitch\": 0.15, \"dyaw\": -0.38, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.13, "window_alt_abs_m": 0.24, "target_px_mean_hist": 627.8, "cur_frame_id": 83, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.2, -12.56, 20.15, -47.85, -63.89, 0.0]\n  Target bbox: [620.85, 324.28, 659.28, 394.77]\n\nFrame 2:\n  Drone pose: [-102.54, -13.1, 20.21, -47.74, -62.84, 0.0]\n  Target bbox: [615.32, 322.29, 664.75, 396.91]\n\nFrame 3:\n  Drone pose: [-102.53, -13.83, 20.19, -47.81, -62.22, 0.0]\n  Target bbox: [613.56, 325.92, 659.89, 401.41]\n\nFrame 4:\n  Drone pose: [-102.62, -14.43, 20.22, -50.42, -63.55, 0.0]\n  Target bbox: [638.32, 286.17, 675.14, 357.97]\n\nFrame 5 (current):\n  Drone pose: [-102.61, -15.0, 20.24, -42.92, -61.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.02, \"ymin\": 407.04, \"xmax\": 667.85, \"ymax\": 480.36}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.69, \"dz\": 0.02, \"dpitch\": -4.91, \"dyaw\": 2.37, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.45, \"dz\": 0.05, \"dpitch\": -4.83, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.24, \"dz\": 0.07, \"dpitch\": -4.76, \"dyaw\": 6.3, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -3.8, \"dz\": 0.1, \"dpitch\": -4.99, \"dyaw\": 11.49, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": -5.97, \"dz\": 0.12, \"dpitch\": -5.07, \"dyaw\": 19.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.36, "window_alt_abs_m": 0.12, "target_px_mean_hist": 614.2, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00115/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-109.81, -34.98, 20.56, -41.41, -2.08, 0.0]\n  Target bbox: [613.3, 334.71, 647.52, 400.1]\n\nFrame 2:\n  Drone pose: [-109.49, -36.08, 20.58, -43.43, -3.61, 0.0]\n  Target bbox: [651.92, 297.32, 688.19, 364.06]\n\nFrame 3:\n  Drone pose: [-109.18, -36.78, 21.37, -39.54, -2.75, 0.0]\n  Target bbox: [647.03, 377.71, 683.74, 444.57]\n\nFrame 4:\n  Drone pose: [-108.76, -37.26, 20.46, -41.75, 0.24, 0.0]\n  Target bbox: [610.86, 317.65, 643.43, 382.1]\n\nFrame 5 (current):\n  Drone pose: [-108.27, -37.88, 20.6, -43.26, 0.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.39, \"ymin\": 299.24, \"xmax\": 638.18, \"ymax\": 356.51}, \"waypoint_deltas\": [{\"dx\": 0.4, \"dy\": -0.21, \"dz\": -0.02, \"dpitch\": 2.07, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": -0.4, \"dz\": -0.03, \"dpitch\": 2.18, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 1.22, \"dy\": -0.52, \"dz\": -0.04, \"dpitch\": 2.31, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": -0.59, \"dz\": -0.04, \"dpitch\": 2.45, \"dyaw\": 0.52, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": -0.65, \"dz\": -0.05, \"dpitch\": 2.59, \"dyaw\": 0.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.67, "window_alt_abs_m": 1.87, "target_px_mean_hist": 475.0, "cur_frame_id": 115, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00131/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.44, -40.52, 20.44, -41.58, 5.42, 0.0]\n  Target bbox: [584.14, 296.61, 616.05, 359.4]\n\nFrame 2:\n  Drone pose: [-103.14, -40.85, 20.53, -39.41, 6.94, 0.0]\n  Target bbox: [554.39, 329.82, 595.42, 398.03]\n\nFrame 3:\n  Drone pose: [-102.79, -41.32, 20.44, -39.24, 1.85, 0.0]\n  Target bbox: [628.99, 326.9, 651.31, 392.79] (model-predicted box)\n\nFrame 4:\n  Drone pose: [-102.36, -41.83, 20.45, -38.25, 4.26, 0.0]\n  Target bbox: [598.4, 356.95, 621.1, 414.2]\n\nFrame 5 (current):\n  Drone pose: [-101.99, -42.27, 20.46, -40.23, 1.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.19, \"ymin\": 331.23, \"xmax\": 651.0, \"ymax\": 388.22}, \"waypoint_deltas\": [{\"dx\": 0.36, \"dy\": -0.44, \"dz\": 0.02, \"dpitch\": -0.47, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": -0.88, \"dz\": 0.03, \"dpitch\": -0.96, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -1.32, \"dz\": 0.05, \"dpitch\": -1.45, \"dyaw\": -0.39, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -1.76, \"dz\": 0.07, \"dpitch\": -1.3, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": -2.2, \"dz\": 0.09, \"dpitch\": -1.16, \"dyaw\": -0.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 1, "current_invisible": false, "window_yaw_abs_deg": 11.48, "window_alt_abs_m": 0.2, "target_px_mean_hist": 281.8, "cur_frame_id": 131, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194/aug_001/frames_playback/frame_00147/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.09, -47.35, 20.75, -41.21, -5.52, 0.0]\n  Target bbox: [684.67, 327.41, 722.44, 394.16]\n\nFrame 2:\n  Drone pose: [-96.5, -47.66, 20.77, -44.46, 1.17, 0.0]\n  Target bbox: [594.07, 274.54, 629.97, 340.11]\n\nFrame 3:\n  Drone pose: [-95.91, -47.96, 20.8, -41.45, -1.53, 0.0]\n  Target bbox: [619.16, 324.16, 660.87, 395.53]\n\nFrame 4:\n  Drone pose: [-95.37, -48.25, 20.92, -38.61, 0.3, 0.0]\n  Target bbox: [589.07, 376.13, 630.94, 446.98]\n\nFrame 5 (current):\n  Drone pose: [-94.61, -48.65, 20.83, -41.88, -2.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.68, \"ymin\": 326.23, \"xmax\": 651.99, \"ymax\": 393.41}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.22, \"dz\": 0.05, \"dpitch\": -0.01, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 1.04, \"dy\": -0.52, \"dz\": 0.07, \"dpitch\": -0.14, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": -0.83, \"dz\": 0.09, \"dpitch\": -0.3, \"dyaw\": 0.84, \"droll\": 0.0}, {\"dx\": 2.22, \"dy\": -1.13, \"dz\": 0.1, \"dpitch\": -0.42, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 2.8, \"dy\": -1.44, \"dz\": 0.11, \"dpitch\": -0.56, \"dyaw\": 1.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.87, "window_alt_abs_m": 0.25, "target_px_mean_hist": 483.5, "cur_frame_id": 147, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_194", "difficulty_score": 0.5091, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-63.97, -67.2, 22.0, -46.91, 135.0, 0.0]\n  Target bbox: [633.22, 340.63, 646.78, 379.06]\n\nFrame 2:\n  Drone pose: [-65.56, -67.67, 21.2, -45.66, 130.81, 0.0]\n  Target bbox: [628.7, 340.18, 651.17, 379.52]\n\nFrame 3:\n  Drone pose: [-66.67, -67.66, 20.67, -44.93, 128.55, 0.0]\n  Target bbox: [630.59, 337.65, 649.28, 382.09]\n\nFrame 4:\n  Drone pose: [-67.51, -67.36, 20.64, -44.96, 127.43, 0.0]\n  Target bbox: [629.32, 339.54, 650.57, 380.16]\n\nFrame 5 (current):\n  Drone pose: [-68.21, -66.94, 20.62, -45.0, 126.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.32, \"ymin\": 337.5, \"xmax\": 649.54, \"ymax\": 382.24}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": 0.47, \"dz\": -0.03, \"dpitch\": -0.03, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": 0.93, \"dz\": -0.05, \"dpitch\": -0.02, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -1.74, \"dy\": 1.39, \"dz\": -0.07, \"dpitch\": 0.02, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": -2.27, \"dy\": 1.83, \"dz\": -0.09, \"dpitch\": 0.09, \"dyaw\": -0.92, \"droll\": 0.0}, {\"dx\": -2.79, \"dy\": 2.28, \"dz\": -0.2, \"dpitch\": 0.29, \"dyaw\": -1.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.16, "window_alt_abs_m": 1.38, "target_px_mean_hist": 225.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00006/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00007/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00010/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-69.4, -66.01, 20.57, -45.02, 126.27, 0.0]\n  Target bbox: [626.84, 338.83, 653.03, 380.89]\n\nFrame 2:\n  Drone pose: [-69.95, -65.55, 20.55, -44.98, 126.07, 0.0]\n  Target bbox: [630.55, 339.51, 649.36, 380.18]\n\nFrame 3:\n  Drone pose: [-70.48, -65.11, 20.53, -44.91, 125.92, 0.0]\n  Target bbox: [626.94, 338.88, 652.93, 380.84]\n\nFrame 4:\n  Drone pose: [-71.0, -64.66, 20.42, -44.71, 125.79, 0.0]\n  Target bbox: [631.37, 339.42, 648.54, 380.26]\n\nFrame 5 (current):\n  Drone pose: [-71.51, -64.2, 20.39, -44.62, 125.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.42, \"ymin\": 339.8, \"xmax\": 649.49, \"ymax\": 379.88}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.46, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.94, \"dz\": -0.06, \"dpitch\": 0.14, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 1.44, \"dz\": -0.09, \"dpitch\": 0.18, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": 1.94, \"dz\": -0.12, \"dpitch\": 0.21, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -2.54, \"dy\": 2.43, \"dz\": -0.15, \"dpitch\": 0.25, \"dyaw\": -0.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.58, "window_alt_abs_m": 0.18, "target_px_mean_hist": 236.0, "cur_frame_id": 10, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-73.03, -62.76, 20.3, -44.44, 125.53, 0.0]\n  Target bbox: [627.23, 338.55, 652.64, 381.18]\n\nFrame 2:\n  Drone pose: [-73.53, -62.26, 20.27, -44.41, 125.54, 0.0]\n  Target bbox: [631.17, 338.28, 648.72, 381.44]\n\nFrame 3:\n  Drone pose: [-74.05, -61.77, 20.24, -44.37, 125.49, 0.0]\n  Target bbox: [629.9, 337.05, 649.95, 382.7]\n\nFrame 4:\n  Drone pose: [-74.56, -61.27, 20.22, -44.34, 125.46, 0.0]\n  Target bbox: [629.69, 337.2, 650.16, 382.56]\n\nFrame 5 (current):\n  Drone pose: [-75.09, -60.74, 20.19, -44.35, 125.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.96, \"ymin\": 338.41, \"xmax\": 652.91, \"ymax\": 381.29}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": 0.57, \"dz\": -0.02, \"dpitch\": -0.07, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 1.18, \"dz\": -0.04, \"dpitch\": -0.17, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": 1.49, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -2.15, \"dy\": 1.58, \"dz\": -0.07, \"dpitch\": 0.47, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": -2.77, \"dy\": 1.66, \"dz\": -0.09, \"dpitch\": 0.89, \"dyaw\": -1.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.11, "window_alt_abs_m": 0.11, "target_px_mean_hist": 190.8, "cur_frame_id": 17, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-76.63, -59.25, 20.13, -44.3, 125.33, 0.0]\n  Target bbox: [629.75, 337.3, 650.11, 382.45]\n\nFrame 2:\n  Drone pose: [-77.24, -59.16, 20.12, -43.88, 124.39, 0.0]\n  Target bbox: [627.09, 338.52, 652.78, 381.23]\n\nFrame 3:\n  Drone pose: [-77.86, -59.08, 20.1, -43.46, 123.46, 0.0]\n  Target bbox: [627.07, 338.38, 652.8, 381.37]\n\nFrame 4:\n  Drone pose: [-78.61, -59.35, 20.09, -42.73, 121.71, 0.0]\n  Target bbox: [630.18, 338.17, 649.7, 381.62]\n\nFrame 5 (current):\n  Drone pose: [-79.41, -59.72, 20.08, -41.91, 119.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.67, \"ymin\": 338.55, \"xmax\": 649.23, \"ymax\": 381.24}, \"waypoint_deltas\": [{\"dx\": -0.89, \"dy\": -0.6, \"dz\": -0.01, \"dpitch\": 1.02, \"dyaw\": -2.29, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.75, \"dz\": -0.02, \"dpitch\": 1.65, \"dyaw\": -3.5, \"droll\": 0.0}, {\"dx\": -2.29, \"dy\": -0.9, \"dz\": -0.03, \"dpitch\": 2.27, \"dyaw\": -4.64, \"droll\": 0.0}, {\"dx\": -2.93, \"dy\": -0.9, \"dz\": -0.04, \"dpitch\": 2.76, \"dyaw\": -5.46, \"droll\": 0.0}, {\"dx\": -3.46, \"dy\": -0.57, \"dz\": -0.04, \"dpitch\": 2.94, \"dyaw\": -5.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.5, "window_alt_abs_m": 0.06, "target_px_mean_hist": 236.5, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.7, -60.62, 20.05, -39.64, 115.19, 0.0]\n  Target bbox: [629.62, 338.46, 650.27, 381.42]\n\nFrame 2:\n  Drone pose: [-82.34, -60.62, 20.04, -39.15, 114.37, 0.0]\n  Target bbox: [629.75, 339.05, 650.16, 380.81]\n\nFrame 3:\n  Drone pose: [-82.87, -60.29, 20.04, -38.97, 114.13, 0.0]\n  Target bbox: [629.2, 339.92, 650.73, 379.94]\n\nFrame 4:\n  Drone pose: [-83.38, -59.87, 20.03, -38.89, 114.04, 0.0]\n  Target bbox: [629.51, 338.46, 650.38, 381.44]\n\nFrame 5 (current):\n  Drone pose: [-83.87, -59.4, 20.03, -38.84, 114.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.27, \"ymin\": 340.23, \"xmax\": 649.67, \"ymax\": 379.6}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.49, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.99, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 1.49, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.01, \"dy\": 1.99, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 2.48, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": -0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.17, "window_alt_abs_m": 0.02, "target_px_mean_hist": 204.5, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-85.36, -57.91, 20.02, -38.81, 114.03, 0.0]\n  Target bbox: [629.35, 338.77, 650.54, 381.13]\n\nFrame 2:\n  Drone pose: [-85.88, -57.41, 20.01, -38.81, 114.0, 0.0]\n  Target bbox: [629.39, 340.2, 650.56, 379.64]\n\nFrame 3:\n  Drone pose: [-86.38, -56.92, 20.01, -38.81, 113.98, 0.0]\n  Target bbox: [626.99, 339.22, 652.93, 380.68]\n\nFrame 4:\n  Drone pose: [-86.89, -56.42, 20.01, -39.01, 113.97, 0.0]\n  Target bbox: [629.93, 339.33, 649.98, 380.53]\n\nFrame 5 (current):\n  Drone pose: [-87.38, -55.9, 20.01, -39.04, 114.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.21, \"ymin\": 338.58, \"xmax\": 650.68, \"ymax\": 381.33}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.99, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 1.47, \"dz\": -0.01, \"dpitch\": 0.04, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 1.92, \"dz\": -0.01, \"dpitch\": 0.1, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 2.32, \"dz\": -0.01, \"dpitch\": 0.19, \"dyaw\": -0.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.09, "window_alt_abs_m": 0.01, "target_px_mean_hist": 187.2, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-88.38, -54.91, 20.0, -39.02, 114.0, 0.0]\n  Target bbox: [629.87, 338.96, 650.02, 380.93]\n\nFrame 2:\n  Drone pose: [-88.88, -54.43, 20.0, -39.0, 113.98, 0.0]\n  Target bbox: [629.21, 338.56, 650.67, 381.35]\n\nFrame 3:\n  Drone pose: [-89.38, -53.98, 20.0, -38.94, 113.93, 0.0]\n  Target bbox: [627.14, 339.1, 652.78, 380.79]\n\nFrame 4:\n  Drone pose: [-89.89, -53.58, 20.0, -38.85, 113.81, 0.0]\n  Target bbox: [630.03, 339.7, 649.89, 380.17]\n\nFrame 5 (current):\n  Drone pose: [-90.42, -53.21, 20.0, -38.72, 113.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.63, \"ymin\": 339.14, \"xmax\": 650.28, \"ymax\": 380.74}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.34, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": 0.69, \"dz\": 0.0, \"dpitch\": -0.25, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": -1.69, \"dy\": 1.07, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": -2.26, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -2.83, \"dy\": 1.96, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 206.2, "cur_frame_id": 44, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-92.11, -52.14, 20.0, -38.88, 113.28, 0.0]\n  Target bbox: [630.2, 339.64, 649.73, 380.21]\n\nFrame 2:\n  Drone pose: [-92.68, -51.71, 20.0, -38.84, 113.05, 0.0]\n  Target bbox: [630.15, 339.81, 649.78, 380.05]\n\nFrame 3:\n  Drone pose: [-93.25, -51.25, 20.0, -38.83, 112.86, 0.0]\n  Target bbox: [626.92, 339.18, 653.01, 380.73]\n\nFrame 4:\n  Drone pose: [-93.81, -50.75, 20.0, -38.85, 112.73, 0.0]\n  Target bbox: [629.14, 338.42, 650.75, 381.49]\n\nFrame 5 (current):\n  Drone pose: [-94.35, -50.24, 20.0, -38.88, 112.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.26, \"ymin\": 338.97, \"xmax\": 650.64, \"ymax\": 380.94}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 1.59, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 2.12, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": 2.64, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.64, "window_alt_abs_m": 0.0, "target_px_mean_hist": 196.5, "cur_frame_id": 51, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.87, -48.65, 20.0, -38.99, 112.68, 0.0]\n  Target bbox: [629.73, 339.45, 650.19, 380.42]\n\nFrame 2:\n  Drone pose: [-96.35, -48.12, 20.0, -39.01, 112.77, 0.0]\n  Target bbox: [629.07, 338.51, 650.82, 381.4]\n\nFrame 3:\n  Drone pose: [-96.81, -47.6, 20.0, -39.02, 112.87, 0.0]\n  Target bbox: [629.3, 338.84, 650.6, 381.06]\n\nFrame 4:\n  Drone pose: [-97.27, -47.09, 20.0, -39.01, 112.96, 0.0]\n  Target bbox: [627.35, 339.21, 652.57, 380.69]\n\nFrame 5 (current):\n  Drone pose: [-97.74, -46.62, 20.0, -38.96, 113.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.26, \"ymin\": 340.18, \"xmax\": 650.69, \"ymax\": 379.66}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": 0.43, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.95, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 1.13, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": 1.41, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -2.41, \"dy\": 1.64, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.33, "window_alt_abs_m": 0.0, "target_px_mean_hist": 205.8, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-99.17, -45.49, 20.0, -39.07, 113.28, 0.0]\n  Target bbox: [627.24, 339.28, 652.69, 380.62]\n\nFrame 2:\n  Drone pose: [-99.66, -45.21, 20.0, -38.83, 113.1, 0.0]\n  Target bbox: [630.87, 339.6, 649.21, 380.27]\n\nFrame 3:\n  Drone pose: [-100.15, -44.98, 20.0, -39.07, 113.35, 0.0]\n  Target bbox: [629.32, 339.94, 650.62, 379.91]\n\nFrame 4:\n  Drone pose: [-100.65, -44.79, 20.0, -38.74, 113.05, 0.0]\n  Target bbox: [628.92, 339.25, 651.16, 380.63]\n\nFrame 5 (current):\n  Drone pose: [-101.16, -44.63, 20.0, -38.91, 113.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.99, \"ymin\": 339.17, \"xmax\": 651.09, \"ymax\": 380.71}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 0.14, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": -0.55, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -1.83, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": -0.68, \"dyaw\": 1.06, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": 2.56, \"dz\": 0.0, \"dpitch\": -1.51, \"dyaw\": 2.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 193.5, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-63.97, -67.2, 22.0, -46.91, 135.0, 0.0]\n  Target bbox: [630.87, 341.57, 649.13, 378.11]\n\nFrame 2:\n  Drone pose: [-65.56, -67.67, 21.2, -45.66, 130.81, 0.0]\n  Target bbox: [631.31, 340.66, 648.6, 379.02]\n\nFrame 3:\n  Drone pose: [-66.67, -67.66, 20.67, -42.45, 133.16, 0.0]\n  Target bbox: [575.71, 383.03, 594.43, 423.12]\n\nFrame 4:\n  Drone pose: [-67.51, -67.36, 20.64, -42.98, 122.43, 0.0]\n  Target bbox: [687.18, 372.74, 711.96, 417.14]\n\nFrame 5 (current):\n  Drone pose: [-68.21, -66.87, 20.59, -43.15, 120.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 660.27, \"ymin\": 337.14, \"xmax\": 684.06, \"ymax\": 378.42}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": -1.88, \"dyaw\": 5.93, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": 0.86, \"dz\": -0.02, \"dpitch\": -1.87, \"dyaw\": 5.68, \"droll\": 0.0}, {\"dx\": -1.74, \"dy\": 1.32, \"dz\": -0.04, \"dpitch\": -1.83, \"dyaw\": 5.48, \"droll\": 0.0}, {\"dx\": -2.27, \"dy\": 1.76, \"dz\": -0.06, \"dpitch\": -1.76, \"dyaw\": 5.33, \"droll\": 0.0}, {\"dx\": -2.79, \"dy\": 2.21, \"dz\": -0.17, \"dpitch\": -1.56, \"dyaw\": 5.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.12, "window_alt_abs_m": 1.41, "target_px_mean_hist": 227.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00005/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00006/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00007/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00009/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-68.77, -66.59, 20.45, -41.81, 127.32, 0.0]\n  Target bbox: [684.65, 392.59, 714.33, 439.41]\n\nFrame 2:\n  Drone pose: [-69.4, -66.01, 20.57, -41.37, 122.81, 0.0]\n  Target bbox: [671.01, 399.63, 691.45, 444.51]\n\nFrame 3:\n  Drone pose: [-69.95, -65.55, 20.55, -44.98, 126.07, 0.0]\n  Target bbox: [629.87, 337.36, 649.99, 382.38]\n\nFrame 4:\n  Drone pose: [-70.48, -65.11, 20.53, -43.26, 126.72, 0.0]\n  Target bbox: [618.22, 366.67, 642.32, 408.54]\n\nFrame 5 (current):\n  Drone pose: [-71.0, -64.66, 20.42, -44.71, 125.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.55, \"ymin\": 339.82, \"xmax\": 649.37, \"ymax\": 379.87}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.46, \"dz\": -0.03, \"dpitch\": 0.09, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.92, \"dz\": -0.06, \"dpitch\": 0.17, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 1.4, \"dz\": -0.09, \"dpitch\": 0.23, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": 1.9, \"dz\": -0.12, \"dpitch\": 0.27, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": 2.4, \"dz\": -0.15, \"dpitch\": 0.3, \"dyaw\": -0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.36, "window_alt_abs_m": 0.27, "target_px_mean_hist": 241.5, "cur_frame_id": 9, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00015/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-72.02, -63.74, 20.36, -48.43, 127.12, 0.0]\n  Target bbox: [611.52, 274.56, 631.93, 314.75]\n\nFrame 2:\n  Drone pose: [-72.53, -63.26, 20.33, -47.81, 126.04, 0.0]\n  Target bbox: [622.69, 282.55, 645.66, 325.53]\n\nFrame 3:\n  Drone pose: [-73.03, -62.63, 20.2, -45.59, 126.01, 0.0]\n  Target bbox: [628.49, 339.82, 651.37, 379.83]\n\nFrame 4:\n  Drone pose: [-73.53, -62.26, 20.27, -46.71, 130.25, 0.0]\n  Target bbox: [572.24, 302.71, 594.44, 343.0]\n\nFrame 5 (current):\n  Drone pose: [-74.05, -61.77, 20.24, -46.47, 126.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.27, \"ymin\": 302.55, \"xmax\": 632.48, \"ymax\": 346.82}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.5, \"dz\": -0.02, \"dpitch\": 2.13, \"dyaw\": -1.44, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": 1.03, \"dz\": -0.05, \"dpitch\": 2.12, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": -1.56, \"dy\": 1.6, \"dz\": -0.07, \"dpitch\": 2.05, \"dyaw\": -1.4, \"droll\": 0.0}, {\"dx\": -2.05, \"dy\": 2.21, \"dz\": -0.09, \"dpitch\": 1.95, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": 2.52, \"dz\": -0.11, \"dpitch\": 2.17, \"dyaw\": -1.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.7, "window_alt_abs_m": 0.26, "target_px_mean_hist": 175.2, "cur_frame_id": 15, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00020/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-74.56, -61.27, 20.22, -49.1, 128.7, 0.0]\n  Target bbox: [590.37, 258.63, 611.11, 302.67]\n\nFrame 2:\n  Drone pose: [-75.09, -60.74, 20.19, -44.35, 125.44, 0.0]\n  Target bbox: [626.95, 338.41, 652.92, 381.3]\n\nFrame 3:\n  Drone pose: [-75.61, -60.17, 20.17, -41.48, 130.5, 0.0]\n  Target bbox: [564.87, 387.11, 594.77, 434.98]\n\nFrame 4:\n  Drone pose: [-76.05, -59.39, 20.2, -43.49, 125.93, 0.0]\n  Target bbox: [689.29, 331.45, 713.89, 373.09]\n\nFrame 5 (current):\n  Drone pose: [-76.58, -59.16, 20.26, -39.62, 126.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.75, \"ymin\": 339.55, \"xmax\": 648.13, \"ymax\": 380.29}, \"waypoint_deltas\": [{\"dx\": -0.66, \"dy\": 0.0, \"dz\": -0.14, \"dpitch\": -4.26, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": 0.08, \"dz\": -0.16, \"dpitch\": -3.84, \"dyaw\": -3.01, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.19, \"dz\": -0.17, \"dpitch\": -3.11, \"dyaw\": -4.76, \"droll\": 0.0}, {\"dx\": -2.83, \"dy\": -0.56, \"dz\": -0.18, \"dpitch\": -2.29, \"dyaw\": -6.64, \"droll\": 0.0}, {\"dx\": -3.72, \"dy\": -1.16, \"dz\": -0.19, \"dpitch\": -1.27, \"dyaw\": -8.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.43, "window_alt_abs_m": 0.13, "target_px_mean_hist": 235.0, "cur_frame_id": 20, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00025/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00026/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-77.86, -59.08, 20.1, -43.46, 123.46, 0.0]\n  Target bbox: [630.09, 339.51, 649.82, 380.21]\n\nFrame 2:\n  Drone pose: [-78.61, -59.35, 20.09, -45.62, 126.14, 0.0]\n  Target bbox: [573.24, 292.2, 597.07, 333.4]\n\nFrame 3:\n  Drone pose: [-79.41, -59.72, 20.08, -36.91, 122.99, 0.0]\n  Target bbox: [589.45, 424.36, 610.92, 464.93]\n\nFrame 4:\n  Drone pose: [-80.3, -60.32, 20.07, -39.04, 112.54, 0.0]\n  Target bbox: [690.15, 371.09, 716.95, 414.3]\n\nFrame 5 (current):\n  Drone pose: [-81.0, -60.47, 20.06, -35.3, 119.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 586.29, \"ymin\": 423.38, \"xmax\": 604.84, \"ymax\": 464.74}, \"waypoint_deltas\": [{\"dx\": -0.7, \"dy\": -0.15, \"dz\": -0.01, \"dpitch\": -4.34, \"dyaw\": -4.59, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": -0.15, \"dz\": -0.02, \"dpitch\": -3.85, \"dyaw\": -5.41, \"droll\": 0.0}, {\"dx\": -1.87, \"dy\": 0.18, \"dz\": -0.02, \"dpitch\": -3.67, \"dyaw\": -5.65, \"droll\": 0.0}, {\"dx\": -2.38, \"dy\": 0.6, \"dz\": -0.03, \"dpitch\": -3.59, \"dyaw\": -5.74, \"droll\": 0.0}, {\"dx\": -2.87, \"dy\": 1.07, \"dz\": -0.03, \"dpitch\": -3.54, \"dyaw\": -5.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.52, "window_alt_abs_m": 0.04, "target_px_mean_hist": 228.5, "cur_frame_id": 26, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.7, -60.62, 20.05, -40.98, 112.48, 0.0]\n  Target bbox: [665.74, 317.69, 684.47, 358.41]\n\nFrame 2:\n  Drone pose: [-82.34, -60.62, 20.04, -39.15, 114.37, 0.0]\n  Target bbox: [629.74, 339.04, 650.17, 380.82]\n\nFrame 3:\n  Drone pose: [-82.87, -60.29, 20.04, -38.97, 114.13, 0.0]\n  Target bbox: [627.87, 339.3, 652.05, 380.58]\n\nFrame 4:\n  Drone pose: [-83.38, -59.87, 20.03, -39.6, 112.78, 0.0]\n  Target bbox: [644.79, 327.56, 668.5, 368.5]\n\nFrame 5 (current):\n  Drone pose: [-83.9, -59.48, 20.19, -42.57, 117.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 584.27, \"ymin\": 310.72, \"xmax\": 606.61, \"ymax\": 353.74}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": 0.57, \"dz\": -0.17, \"dpitch\": 3.75, \"dyaw\": -3.69, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": 1.07, \"dz\": -0.17, \"dpitch\": 3.76, \"dyaw\": -3.69, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": 1.57, \"dz\": -0.17, \"dpitch\": 3.76, \"dyaw\": -3.7, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 2.07, \"dz\": -0.18, \"dpitch\": 3.76, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -2.48, \"dy\": 2.56, \"dz\": -0.18, \"dpitch\": 3.76, \"dyaw\": -3.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.44, "window_alt_abs_m": 0.18, "target_px_mean_hist": 205.8, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00037/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.92, -58.32, 19.91, -35.98, 112.63, 0.0]\n  Target bbox: [628.99, 340.22, 651.07, 379.72]\n\nFrame 2:\n  Drone pose: [-85.36, -57.91, 20.02, -38.81, 114.03, 0.0]\n  Target bbox: [631.03, 339.68, 648.9, 380.17]\n\nFrame 3:\n  Drone pose: [-85.88, -57.41, 20.01, -39.74, 115.66, 0.0]\n  Target bbox: [608.71, 325.12, 627.58, 364.03]\n\nFrame 4:\n  Drone pose: [-86.38, -56.92, 20.01, -38.81, 113.98, 0.0]\n  Target bbox: [630.19, 340.29, 649.75, 379.55]\n\nFrame 5 (current):\n  Drone pose: [-86.89, -56.42, 20.01, -35.97, 116.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 599.41, \"ymin\": 391.78, \"xmax\": 618.04, \"ymax\": 430.82}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": -3.07, \"dyaw\": -2.36, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -3.06, \"dyaw\": -2.36, \"droll\": 0.0}, {\"dx\": -1.49, \"dy\": 1.51, \"dz\": -0.01, \"dpitch\": -3.05, \"dyaw\": -2.36, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": 1.99, \"dz\": -0.01, \"dpitch\": -3.03, \"dyaw\": -2.38, \"droll\": 0.0}, {\"dx\": -2.49, \"dy\": 2.44, \"dz\": -0.01, \"dpitch\": -2.97, \"dyaw\": -2.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.1, "window_alt_abs_m": 0.11, "target_px_mean_hist": 182.2, "cur_frame_id": 37, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-87.38, -55.9, 20.01, -39.04, 114.0, 0.0]\n  Target bbox: [631.1, 340.02, 648.83, 379.82]\n\nFrame 2:\n  Drone pose: [-87.88, -55.41, 20.01, -38.29, 109.0, 0.0]\n  Target bbox: [694.52, 353.34, 715.97, 394.76]\n\nFrame 3:\n  Drone pose: [-88.38, -54.91, 20.0, -38.22, 113.53, 0.0]\n  Target bbox: [633.23, 352.49, 659.09, 394.28]\n\nFrame 4:\n  Drone pose: [-88.88, -54.43, 20.0, -40.96, 111.74, 0.0]\n  Target bbox: [657.45, 306.75, 681.28, 348.26]\n\nFrame 5 (current):\n  Drone pose: [-89.44, -53.82, 20.04, -35.9, 118.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.35, \"ymin\": 391.02, \"xmax\": 638.19, \"ymax\": 430.69}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": 0.24, \"dz\": -0.04, \"dpitch\": -2.95, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.61, \"dz\": -0.04, \"dpitch\": -2.82, \"dyaw\": -4.88, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 0.95, \"dz\": -0.04, \"dpitch\": -2.68, \"dyaw\": -5.13, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": 1.3, \"dz\": -0.04, \"dpitch\": -3.07, \"dyaw\": -4.94, \"droll\": 0.0}, {\"dx\": -2.67, \"dy\": 1.68, \"dz\": -0.04, \"dpitch\": -2.98, \"dyaw\": -5.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.07, "window_alt_abs_m": 0.04, "target_px_mean_hist": 200.0, "cur_frame_id": 42, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.42, -53.21, 20.0, -39.94, 108.62, 0.0]\n  Target bbox: [694.97, 321.0, 716.1, 361.52]\n\nFrame 2:\n  Drone pose: [-90.97, -52.87, 20.0, -38.78, 118.27, 0.0]\n  Target bbox: [566.76, 338.6, 584.38, 377.85]\n\nFrame 3:\n  Drone pose: [-91.56, -52.42, 19.9, -41.17, 111.23, 0.0]\n  Target bbox: [625.82, 337.6, 654.12, 382.29]\n\nFrame 4:\n  Drone pose: [-92.11, -52.14, 20.0, -38.59, 111.88, 0.0]\n  Target bbox: [645.89, 344.21, 671.04, 385.74]\n\nFrame 5 (current):\n  Drone pose: [-92.68, -51.71, 20.0, -35.5, 113.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.57, \"ymin\": 394.56, \"xmax\": 644.7, \"ymax\": 437.54}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": -3.33, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.96, \"dz\": 0.0, \"dpitch\": -3.35, \"dyaw\": -0.76, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": 1.47, \"dz\": 0.0, \"dpitch\": -3.38, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": -2.2, \"dy\": 1.99, \"dz\": 0.0, \"dpitch\": -3.42, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": -2.7, \"dy\": 2.53, \"dz\": 0.0, \"dpitch\": -3.46, \"dyaw\": -0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.95, "window_alt_abs_m": 0.21, "target_px_mean_hist": 208.5, "cur_frame_id": 48, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144/aug_001/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.25, -51.25, 20.0, -41.13, 111.24, 0.0]\n  Target bbox: [652.99, 301.97, 669.5, 341.24]\n\nFrame 2:\n  Drone pose: [-93.68, -50.75, 19.89, -39.12, 116.61, 0.0]\n  Target bbox: [630.49, 339.5, 649.61, 380.36]\n\nFrame 3:\n  Drone pose: [-94.27, -50.21, 20.03, -41.05, 106.83, 0.0]\n  Target bbox: [626.05, 337.14, 654.0, 382.8]\n\nFrame 4:\n  Drone pose: [-94.88, -49.72, 20.0, -38.92, 112.61, 0.0]\n  Target bbox: [629.6, 340.07, 650.34, 379.78]\n\nFrame 5 (current):\n  Drone pose: [-95.38, -49.18, 20.0, -39.44, 115.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.33, \"ymin\": 331.71, \"xmax\": 610.04, \"ymax\": 373.38}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": -3.02, \"droll\": 0.0}, {\"dx\": -0.97, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": 0.43, \"dyaw\": -2.93, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 1.58, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": -2.83, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": 2.09, \"dz\": 0.0, \"dpitch\": 0.43, \"dyaw\": -2.74, \"droll\": 0.0}, {\"dx\": -2.36, \"dy\": 2.56, \"dz\": 0.0, \"dpitch\": 0.48, \"dyaw\": -2.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.02, "window_alt_abs_m": 0.28, "target_px_mean_hist": 205.8, "cur_frame_id": 53, "source": "aug_001", "fut_invisible_cnt": 3}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057144", "difficulty_score": 0.343, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.63, 124.44, 22.0, -46.27, -177.14, 0.0]\n  Target bbox: [622.49, 327.4, 657.54, 391.98]\n\nFrame 2:\n  Drone pose: [-50.19, 123.22, 21.2, -46.74, -179.15, 0.0]\n  Target bbox: [627.89, 323.88, 652.44, 395.37]\n\nFrame 3:\n  Drone pose: [-51.14, 122.65, 20.67, -46.67, 179.08, 0.0]\n  Target bbox: [627.06, 325.9, 652.64, 393.3]\n\nFrame 4:\n  Drone pose: [-51.8, 122.42, 20.64, -46.86, 178.36, 0.0]\n  Target bbox: [627.37, 327.9, 652.39, 391.17]\n\nFrame 5 (current):\n  Drone pose: [-52.34, 122.34, 20.62, -46.89, 178.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.27, \"ymin\": 326.28, \"xmax\": 652.45, \"ymax\": 392.87}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.03, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.06, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.08, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": -0.28, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.11, \"dz\": -0.09, \"dpitch\": 0.05, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": -0.14, \"dz\": -0.2, \"dpitch\": 0.19, \"dyaw\": -0.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.74, "window_alt_abs_m": 1.38, "target_px_mean_hist": 506.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-57.98, 122.07, 20.24, -46.76, 177.25, 0.0]\n  Target bbox: [625.1, 320.88, 654.45, 398.42]\n\nFrame 2:\n  Drone pose: [-58.49, 122.07, 20.22, -46.74, 177.23, 0.0]\n  Target bbox: [626.85, 329.13, 652.95, 389.9]\n\nFrame 3:\n  Drone pose: [-59.0, 122.07, 20.19, -46.71, 177.24, 0.0]\n  Target bbox: [626.4, 322.84, 653.24, 396.35]\n\nFrame 4:\n  Drone pose: [-59.51, 122.08, 20.17, -46.7, 177.26, 0.0]\n  Target bbox: [625.02, 320.78, 654.53, 398.5]\n\nFrame 5 (current):\n  Drone pose: [-60.02, 122.09, 20.15, -46.69, 177.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.7, \"ymin\": 322.48, \"xmax\": 653.89, \"ymax\": 396.83}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.03, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.05, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 0.08, \"dz\": -0.05, \"dpitch\": 0.02, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": 0.12, \"dz\": -0.06, \"dpitch\": 0.02, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": -2.54, \"dy\": 0.15, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.09, "window_alt_abs_m": 0.09, "target_px_mean_hist": 593.0, "cur_frame_id": 19, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-66.09, 122.27, 20.03, -46.64, 177.85, 0.0]\n  Target bbox: [626.97, 324.8, 652.72, 394.34]\n\nFrame 2:\n  Drone pose: [-66.61, 122.18, 20.02, -46.65, 177.57, 0.0]\n  Target bbox: [617.87, 320.95, 662.18, 398.29]\n\nFrame 3:\n  Drone pose: [-67.14, 122.08, 20.02, -46.71, 178.84, 0.0]\n  Target bbox: [618.03, 321.01, 661.94, 398.2]\n\nFrame 4:\n  Drone pose: [-67.67, 121.98, 20.02, -46.76, -179.87, 0.0]\n  Target bbox: [626.25, 321.12, 653.85, 398.18]\n\nFrame 5 (current):\n  Drone pose: [-68.2, 121.9, 20.01, -46.8, 179.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.34, \"ymin\": 322.06, \"xmax\": 653.56, \"ymax\": 397.23}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.08, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.11, "window_alt_abs_m": 0.01, "target_px_mean_hist": 585.2, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-73.89, 122.42, 20.0, -47.07, -178.45, 0.0]\n  Target bbox: [626.32, 324.41, 654.02, 394.68]\n\nFrame 2:\n  Drone pose: [-74.43, 122.61, 20.0, -47.14, -177.85, 0.0]\n  Target bbox: [626.54, 325.32, 653.77, 393.8]\n\nFrame 3:\n  Drone pose: [-75.0, 122.83, 20.0, -47.23, -177.13, 0.0]\n  Target bbox: [626.06, 324.77, 654.26, 394.3]\n\nFrame 4:\n  Drone pose: [-75.58, 123.08, 20.0, -47.34, -176.29, 0.0]\n  Target bbox: [626.64, 324.64, 653.67, 394.45]\n\nFrame 5 (current):\n  Drone pose: [-76.21, 123.38, 20.0, -47.51, -175.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.74, \"ymin\": 321.4, \"xmax\": 654.65, \"ymax\": 397.77}, \"waypoint_deltas\": [{\"dx\": -0.66, \"dy\": 0.34, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": 1.19, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": 0.74, \"dz\": 0.0, \"dpitch\": -0.51, \"dyaw\": 2.59, \"droll\": 0.0}, {\"dx\": -2.17, \"dy\": 1.19, \"dz\": 0.0, \"dpitch\": -0.87, \"dyaw\": 4.22, \"droll\": 0.0}, {\"dx\": -3.02, \"dy\": 1.68, \"dz\": 0.0, \"dpitch\": -1.32, \"dyaw\": 6.07, \"droll\": 0.0}, {\"dx\": -3.94, \"dy\": 2.17, \"dz\": 0.0, \"dpitch\": -1.65, \"dyaw\": 9.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.16, "window_alt_abs_m": 0.0, "target_px_mean_hist": 623.8, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.44, 127.24, 20.0, -50.27, -146.26, 0.0]\n  Target bbox: [620.66, 325.58, 659.09, 393.31]\n\nFrame 2:\n  Drone pose: [-87.0, 127.16, 20.0, -49.94, -144.89, 0.0]\n  Target bbox: [623.04, 325.13, 656.7, 393.74]\n\nFrame 3:\n  Drone pose: [-87.45, 127.02, 20.0, -49.5, -143.96, 0.0]\n  Target bbox: [616.08, 317.84, 664.21, 401.25]\n\nFrame 4:\n  Drone pose: [-87.79, 126.84, 20.0, -49.64, -142.32, 0.0]\n  Target bbox: [616.82, 318.48, 663.45, 400.57]\n\nFrame 5 (current):\n  Drone pose: [-88.06, 126.63, 20.0, -49.7, -140.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.13, \"ymin\": 317.04, \"xmax\": 666.2, \"ymax\": 402.17}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.24, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.22, \"droll\": 0.0}, {\"dx\": -0.44, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 2.36, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": -0.76, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 3.53, \"droll\": 0.0}, {\"dx\": -0.93, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 4.8, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": -1.27, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 6.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.32, "window_alt_abs_m": 0.0, "target_px_mean_hist": 658.0, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-91.95, 123.59, 20.0, -50.45, -123.36, 0.0]\n  Target bbox: [621.77, 323.91, 658.13, 394.97]\n\nFrame 2:\n  Drone pose: [-92.27, 123.24, 20.0, -50.53, -122.08, 0.0]\n  Target bbox: [616.05, 321.11, 663.93, 397.84]\n\nFrame 3:\n  Drone pose: [-92.57, 122.87, 20.0, -50.6, -120.91, 0.0]\n  Target bbox: [622.07, 324.36, 657.82, 394.53]\n\nFrame 4:\n  Drone pose: [-92.86, 122.49, 20.0, -50.69, -119.78, 0.0]\n  Target bbox: [621.55, 323.64, 658.33, 395.2]\n\nFrame 5 (current):\n  Drone pose: [-93.15, 122.09, 20.0, -50.79, -118.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.85, \"ymin\": 320.38, \"xmax\": 665.1, \"ymax\": 398.48}, \"waypoint_deltas\": [{\"dx\": -0.3, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": -0.77, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 2.33, \"droll\": 0.0}, {\"dx\": -0.9, \"dy\": -1.14, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 3.56, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 4.83, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -1.83, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 6.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.7, "window_alt_abs_m": 0.0, "target_px_mean_hist": 642.2, "cur_frame_id": 82, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-96.41, 118.24, 20.0, -50.24, -105.36, 0.0]\n  Target bbox: [621.11, 324.5, 658.67, 394.36]\n\nFrame 2:\n  Drone pose: [-96.66, 117.88, 20.0, -50.11, -104.37, 0.0]\n  Target bbox: [624.24, 325.88, 655.55, 393.01]\n\nFrame 3:\n  Drone pose: [-96.89, 117.5, 20.0, -49.99, -103.44, 0.0]\n  Target bbox: [618.45, 323.68, 661.28, 395.18]\n\nFrame 4:\n  Drone pose: [-97.11, 117.12, 20.0, -49.89, -102.58, 0.0]\n  Target bbox: [616.52, 323.77, 663.21, 395.06]\n\nFrame 5 (current):\n  Drone pose: [-97.32, 116.72, 20.0, -49.79, -101.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.46, \"ymin\": 326.5, \"xmax\": 655.33, \"ymax\": 392.41}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 1.47, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": -1.25, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": 2.13, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": 0.34, \"dyaw\": 2.74, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -2.1, \"dz\": 0.0, \"dpitch\": 0.43, \"dyaw\": 3.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.58, "window_alt_abs_m": 0.0, "target_px_mean_hist": 628.8, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.61, 111.49, 20.0, -48.84, -96.89, 0.0]\n  Target bbox: [615.0, 324.81, 664.62, 394.11]\n\nFrame 2:\n  Drone pose: [-98.56, 111.01, 20.0, -48.79, -97.05, 0.0]\n  Target bbox: [626.56, 326.91, 653.23, 392.01]\n\nFrame 3:\n  Drone pose: [-98.48, 110.52, 20.0, -48.75, -97.31, 0.0]\n  Target bbox: [615.4, 324.13, 664.23, 394.75]\n\nFrame 4:\n  Drone pose: [-98.38, 110.03, 20.0, -48.72, -97.67, 0.0]\n  Target bbox: [619.47, 325.93, 660.24, 392.99]\n\nFrame 5 (current):\n  Drone pose: [-98.25, 109.52, 20.0, -48.7, -98.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.48, \"ymin\": 325.3, \"xmax\": 662.21, \"ymax\": 393.62}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -1.05, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -1.58, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.61, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": -2.13, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.13, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": -2.69, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -2.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 624.5, "cur_frame_id": 113, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.84, 103.24, 20.0, -49.12, -106.78, 0.0]\n  Target bbox: [616.88, 320.75, 663.33, 398.39]\n\nFrame 2:\n  Drone pose: [-98.04, 102.61, 20.0, -49.17, -107.92, 0.0]\n  Target bbox: [616.5, 320.04, 663.73, 399.04]\n\nFrame 3:\n  Drone pose: [-98.26, 101.99, 20.0, -49.22, -108.99, 0.0]\n  Target bbox: [617.59, 320.27, 662.64, 398.74]\n\nFrame 4:\n  Drone pose: [-98.49, 101.37, 20.0, -49.27, -110.01, 0.0]\n  Target bbox: [622.07, 323.94, 658.11, 395.04]\n\nFrame 5 (current):\n  Drone pose: [-98.74, 100.74, 20.0, -49.31, -110.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.84, \"ymin\": 320.42, \"xmax\": 662.41, \"ymax\": 398.57}, \"waypoint_deltas\": [{\"dx\": -0.25, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.95, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": -1.24, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -1.87, \"droll\": 0.0}, {\"dx\": -0.8, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -2.75, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -3.6, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -3.07, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -4.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 603.8, "cur_frame_id": 128, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/ORI/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.31, 93.46, 20.0, -49.48, -120.63, 0.0]\n  Target bbox: [620.29, 324.68, 659.93, 394.2]\n\nFrame 2:\n  Drone pose: [-102.63, 92.86, 20.0, -49.47, -121.34, 0.0]\n  Target bbox: [618.81, 321.31, 661.56, 397.65]\n\nFrame 3:\n  Drone pose: [-102.96, 92.26, 20.0, -49.47, -122.04, 0.0]\n  Target bbox: [622.77, 322.05, 657.55, 396.85]\n\nFrame 4:\n  Drone pose: [-103.29, 91.65, 20.0, -49.46, -122.74, 0.0]\n  Target bbox: [618.85, 321.89, 661.53, 397.11]\n\nFrame 5 (current):\n  Drone pose: [-103.63, 91.05, 20.0, -49.46, -123.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.99, \"ymin\": 321.73, \"xmax\": 661.39, \"ymax\": 397.24}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": -1.23, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.42, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -2.14, \"droll\": 0.0}, {\"dx\": -1.32, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -2.87, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": -3.09, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -3.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.8, "window_alt_abs_m": 0.0, "target_px_mean_hist": 630.8, "cur_frame_id": 144, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-48.75, 124.37, 21.93, -46.34, -177.35, 0.0]\n  Target bbox: [619.88, 324.52, 660.12, 394.86]\n\nFrame 2:\n  Drone pose: [-50.19, 123.22, 21.2, -46.74, -179.15, 0.0]\n  Target bbox: [627.2, 323.9, 653.18, 395.47]\n\nFrame 3:\n  Drone pose: [-51.14, 122.65, 20.67, -46.67, 179.08, 0.0]\n  Target bbox: [627.59, 324.87, 652.11, 394.32]\n\nFrame 4:\n  Drone pose: [-51.83, 122.43, 20.49, -41.69, 182.43, 0.0]\n  Target bbox: [578.65, 406.47, 607.84, 483.41]\n\nFrame 5 (current):\n  Drone pose: [-52.34, 122.34, 20.62, -47.21, 173.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 683.23, \"ymin\": 322.73, \"xmax\": 711.76, \"ymax\": 389.37}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.03, \"dz\": -0.03, \"dpitch\": 0.33, \"dyaw\": 4.89, \"droll\": 0.0}, {\"dx\": -1.03, \"dy\": -0.06, \"dz\": -0.05, \"dpitch\": 0.35, \"dyaw\": 4.8, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.08, \"dz\": -0.07, \"dpitch\": 0.36, \"dyaw\": 4.72, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.11, \"dz\": -0.09, \"dpitch\": 0.37, \"dyaw\": 4.64, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": -0.14, \"dz\": -0.2, \"dpitch\": 0.51, \"dyaw\": 4.55, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.24, "window_alt_abs_m": 1.56, "target_px_mean_hist": 531.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00019/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.11, 122.09, 20.34, -47.12, 177.27, 0.0]\n  Target bbox: [625.06, 321.33, 654.51, 397.88]\n\nFrame 2:\n  Drone pose: [-58.49, 122.07, 20.22, -44.27, 179.38, 0.0]\n  Target bbox: [600.71, 365.83, 628.98, 436.83]\n\nFrame 3:\n  Drone pose: [-58.83, 122.09, 20.21, -51.34, 177.32, 0.0]\n  Target bbox: [626.57, 242.47, 652.93, 313.43]\n\nFrame 4:\n  Drone pose: [-59.39, 122.23, 20.2, -44.81, 178.99, 0.0]\n  Target bbox: [611.18, 351.04, 639.1, 427.59]\n\nFrame 5 (current):\n  Drone pose: [-60.02, 122.09, 20.15, -41.69, 180.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.03, \"ymin\": 413.69, \"xmax\": 615.14, \"ymax\": 475.06}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": 0.03, \"dz\": -0.02, \"dpitch\": -4.99, \"dyaw\": -3.19, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.05, \"dz\": -0.03, \"dpitch\": -4.98, \"dyaw\": -3.1, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 0.08, \"dz\": -0.05, \"dpitch\": -4.98, \"dyaw\": -3.01, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": 0.12, \"dz\": -0.06, \"dpitch\": -4.98, \"dyaw\": -2.9, \"droll\": 0.0}, {\"dx\": -2.54, \"dy\": 0.15, \"dz\": -0.07, \"dpitch\": -4.97, \"dyaw\": -2.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.4, "window_alt_abs_m": 0.19, "target_px_mean_hist": 593.2, "cur_frame_id": 19, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-66.09, 122.27, 20.03, -46.64, 177.85, 0.0]\n  Target bbox: [626.47, 322.24, 653.16, 396.97]\n\nFrame 2:\n  Drone pose: [-66.46, 122.22, 20.01, -47.09, 172.7, 0.0]\n  Target bbox: [676.44, 312.0, 720.42, 387.94]\n\nFrame 3:\n  Drone pose: [-67.31, 122.17, 19.97, -46.91, 179.11, 0.0]\n  Target bbox: [620.3, 323.16, 659.73, 396.02]\n\nFrame 4:\n  Drone pose: [-67.62, 121.97, 20.17, -46.91, -179.9, 0.0]\n  Target bbox: [627.47, 323.02, 652.6, 396.13]\n\nFrame 5 (current):\n  Drone pose: [-68.2, 121.95, 19.86, -46.58, -179.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.84, \"ymin\": 320.68, \"xmax\": 654.17, \"ymax\": 398.54}, \"waypoint_deltas\": [{\"dx\": -0.52, \"dy\": -0.11, \"dz\": 0.15, \"dpitch\": -0.25, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.15, \"dz\": 0.15, \"dpitch\": -0.28, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": -1.55, \"dy\": -0.16, \"dz\": 0.15, \"dpitch\": -0.29, \"dyaw\": -0.52, \"droll\": 0.0}, {\"dx\": -2.06, \"dy\": -0.16, \"dz\": 0.15, \"dpitch\": -0.3, \"dyaw\": -0.49, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.13, \"dz\": 0.14, \"dpitch\": -0.31, \"dyaw\": -0.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.63, "window_alt_abs_m": 0.56, "target_px_mean_hist": 594.5, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-73.76, 122.32, 20.02, -46.9, -178.81, 0.0]\n  Target bbox: [626.08, 322.38, 654.31, 396.77]\n\nFrame 2:\n  Drone pose: [-74.43, 122.61, 20.0, -47.14, -177.85, 0.0]\n  Target bbox: [626.81, 323.11, 653.55, 396.05]\n\nFrame 3:\n  Drone pose: [-75.0, 122.83, 20.0, -47.23, -177.13, 0.0]\n  Target bbox: [625.26, 320.21, 655.2, 399.0]\n\nFrame 4:\n  Drone pose: [-75.58, 123.08, 20.0, -47.34, -176.29, 0.0]\n  Target bbox: [625.94, 321.6, 654.45, 397.57]\n\nFrame 5 (current):\n  Drone pose: [-76.22, 123.31, 20.02, -47.57, -175.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.28, \"ymin\": 323.52, \"xmax\": 655.07, \"ymax\": 395.56}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": 0.41, \"dz\": -0.02, \"dpitch\": -0.16, \"dyaw\": 1.41, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": 0.81, \"dz\": -0.02, \"dpitch\": -0.45, \"dyaw\": 2.81, \"droll\": 0.0}, {\"dx\": -2.16, \"dy\": 1.26, \"dz\": -0.02, \"dpitch\": -0.81, \"dyaw\": 4.44, \"droll\": 0.0}, {\"dx\": -3.01, \"dy\": 1.75, \"dz\": -0.02, \"dpitch\": -1.26, \"dyaw\": 6.29, \"droll\": 0.0}, {\"dx\": -3.93, \"dy\": 2.24, \"dz\": -0.02, \"dpitch\": -1.59, \"dyaw\": 9.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.28, "window_alt_abs_m": 0.04, "target_px_mean_hist": 619.8, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-86.44, 127.24, 20.0, -49.83, -145.88, 0.0]\n  Target bbox: [617.01, 333.88, 654.38, 399.75]\n\nFrame 2:\n  Drone pose: [-87.0, 127.16, 20.0, -51.44, -141.82, 0.0]\n  Target bbox: [589.37, 298.2, 623.04, 371.65]\n\nFrame 3:\n  Drone pose: [-87.49, 126.89, 19.99, -49.67, -144.23, 0.0]\n  Target bbox: [618.78, 320.66, 661.42, 398.37]\n\nFrame 4:\n  Drone pose: [-87.79, 126.84, 20.0, -53.35, -147.06, 0.0]\n  Target bbox: [667.83, 257.94, 716.84, 339.88]\n\nFrame 5 (current):\n  Drone pose: [-87.92, 126.74, 20.02, -49.42, -140.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.3, \"ymin\": 317.01, \"xmax\": 666.02, \"ymax\": 402.19}, \"waypoint_deltas\": [{\"dx\": -0.37, \"dy\": -0.35, \"dz\": -0.02, \"dpitch\": -0.3, \"dyaw\": 1.22, \"droll\": 0.0}, {\"dx\": -0.58, \"dy\": -0.61, \"dz\": -0.02, \"dpitch\": -0.3, \"dyaw\": 2.36, \"droll\": 0.0}, {\"dx\": -0.8, \"dy\": -0.87, \"dz\": -0.02, \"dpitch\": -0.32, \"dyaw\": 3.53, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": -1.13, \"dz\": -0.02, \"dpitch\": -0.37, \"dyaw\": 4.8, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -1.38, \"dz\": -0.02, \"dpitch\": -0.45, \"dyaw\": 6.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.42, "window_alt_abs_m": 0.05, "target_px_mean_hist": 642.8, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-92.05, 123.6, 19.89, -49.71, -128.05, 0.0]\n  Target bbox: [675.0, 336.59, 712.88, 407.84]\n\nFrame 2:\n  Drone pose: [-92.27, 123.24, 20.0, -50.53, -122.08, 0.0]\n  Target bbox: [616.97, 321.58, 663.0, 397.35]\n\nFrame 3:\n  Drone pose: [-92.57, 122.85, 20.02, -50.67, -120.95, 0.0]\n  Target bbox: [621.87, 324.46, 658.02, 394.44]\n\nFrame 4:\n  Drone pose: [-92.99, 122.42, 19.96, -50.85, -119.49, 0.0]\n  Target bbox: [617.68, 322.07, 662.25, 396.8]\n\nFrame 5 (current):\n  Drone pose: [-93.15, 122.09, 20.0, -50.79, -118.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.51, \"ymin\": 323.68, \"xmax\": 659.36, \"ymax\": 395.23}, \"waypoint_deltas\": [{\"dx\": -0.3, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": -0.77, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 2.33, \"droll\": 0.0}, {\"dx\": -0.9, \"dy\": -1.14, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 3.56, \"droll\": 0.0}, {\"dx\": -1.21, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 4.83, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -1.83, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 6.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.4, "window_alt_abs_m": 0.24, "target_px_mean_hist": 652.8, "cur_frame_id": 82, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-96.35, 118.17, 20.09, -48.08, -100.64, 0.0]\n  Target bbox: [558.46, 364.85, 612.89, 437.43]\n\nFrame 2:\n  Drone pose: [-96.48, 117.87, 20.02, -45.48, -110.0, 0.0]\n  Target bbox: [669.16, 400.75, 719.92, 476.27]\n\nFrame 3:\n  Drone pose: [-96.93, 117.63, 20.09, -49.92, -103.19, 0.0]\n  Target bbox: [621.14, 325.53, 658.64, 393.37]\n\nFrame 4:\n  Drone pose: [-97.07, 117.06, 20.06, -51.34, -97.78, 0.0]\n  Target bbox: [559.86, 303.87, 610.68, 375.32]\n\nFrame 5 (current):\n  Drone pose: [-97.32, 116.72, 20.0, -44.79, -97.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 569.31, \"ymin\": 410.32, \"xmax\": 618.13, \"ymax\": 479.31}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": -4.91, \"dyaw\": -3.43, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": -4.83, \"dyaw\": -2.72, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": -1.25, \"dz\": 0.0, \"dpitch\": -4.74, \"dyaw\": -2.06, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -4.66, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -2.1, \"dz\": 0.0, \"dpitch\": -4.57, \"dyaw\": -0.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.77, "window_alt_abs_m": 0.22, "target_px_mean_hist": 646.2, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-98.61, 111.49, 20.0, -45.17, -101.34, 0.0]\n  Target bbox: [663.57, 386.23, 715.74, 458.77]\n\nFrame 2:\n  Drone pose: [-98.7, 111.12, 19.97, -48.59, -96.54, 0.0]\n  Target bbox: [624.93, 326.6, 654.84, 392.33]\n\nFrame 3:\n  Drone pose: [-98.48, 110.52, 20.0, -49.87, -95.73, 0.0]\n  Target bbox: [601.27, 307.75, 642.56, 374.14]\n\nFrame 4:\n  Drone pose: [-98.33, 110.02, 20.1, -48.86, -97.83, 0.0]\n  Target bbox: [621.82, 326.35, 657.93, 392.56]\n\nFrame 5 (current):\n  Drone pose: [-98.25, 109.52, 20.0, -48.7, -98.1, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.36, \"ymin\": 326.38, \"xmax\": 656.4, \"ymax\": 392.52}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -1.05, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -1.58, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.61, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": -2.13, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -2.13, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": -2.69, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -2.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.97, "window_alt_abs_m": 0.25, "target_px_mean_hist": 642.8, "cur_frame_id": 113, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.83, 103.38, 19.86, -46.58, -106.65, 0.0]\n  Target bbox: [617.3, 356.09, 661.65, 433.16]\n\nFrame 2:\n  Drone pose: [-98.04, 102.63, 20.13, -46.84, -107.3, 0.0]\n  Target bbox: [610.8, 361.77, 656.04, 441.06]\n\nFrame 3:\n  Drone pose: [-98.26, 101.99, 20.0, -49.47, -113.9, 0.0]\n  Target bbox: [672.72, 321.57, 716.17, 392.58]\n\nFrame 4:\n  Drone pose: [-98.49, 101.37, 20.0, -50.42, -105.89, 0.0]\n  Target bbox: [570.13, 301.86, 618.6, 381.07]\n\nFrame 5 (current):\n  Drone pose: [-98.62, 100.74, 20.0, -45.75, -109.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 597.09, \"ymin\": 382.09, \"xmax\": 639.09, \"ymax\": 455.25}, \"waypoint_deltas\": [{\"dx\": -0.37, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -3.61, \"dyaw\": -2.5, \"droll\": 0.0}, {\"dx\": -0.64, \"dy\": -1.24, \"dz\": 0.0, \"dpitch\": -3.64, \"dyaw\": -3.42, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": -3.67, \"dyaw\": -4.3, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": -3.7, \"dyaw\": -5.15, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": -3.07, \"dz\": 0.0, \"dpitch\": -3.71, \"dyaw\": -5.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.81, "window_alt_abs_m": 0.4, "target_px_mean_hist": 630.8, "cur_frame_id": 128, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822/aug_001/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.31, 93.46, 20.0, -49.77, -116.91, 0.0]\n  Target bbox: [576.81, 317.61, 621.23, 393.6]\n\nFrame 2:\n  Drone pose: [-102.63, 92.86, 20.0, -49.47, -121.34, 0.0]\n  Target bbox: [620.05, 323.82, 660.21, 395.04]\n\nFrame 3:\n  Drone pose: [-103.01, 92.08, 20.02, -54.07, -125.53, 0.0]\n  Target bbox: [656.7, 253.26, 696.61, 323.84]\n\nFrame 4:\n  Drone pose: [-103.3, 91.57, 20.14, -52.01, -117.89, 0.0]\n  Target bbox: [565.52, 289.87, 605.34, 358.73]\n\nFrame 5 (current):\n  Drone pose: [-103.63, 91.05, 20.0, -51.25, -120.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 588.35, \"ymin\": 292.14, \"xmax\": 631.61, \"ymax\": 367.69}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": 1.79, \"dyaw\": -3.42, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": -1.23, \"dz\": 0.0, \"dpitch\": 1.8, \"dyaw\": -4.14, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": -1.85, \"dz\": 0.0, \"dpitch\": 1.79, \"dyaw\": -4.86, \"droll\": 0.0}, {\"dx\": -1.32, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 1.79, \"dyaw\": -5.59, \"droll\": 0.0}, {\"dx\": -1.66, \"dy\": -3.09, \"dz\": 0.0, \"dpitch\": 1.8, \"dyaw\": -6.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.08, "window_alt_abs_m": 0.29, "target_px_mean_hist": 637.5, "cur_frame_id": 144, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776502822", "difficulty_score": 0.2607, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.61, -57.06, 22.0, -46.4, 0.0, 0.0]\n  Target bbox: [625.3, 324.29, 654.7, 394.96]\n\nFrame 2:\n  Drone pose: [-98.02, -57.96, 21.2, -43.99, 2.47, 0.0]\n  Target bbox: [625.64, 323.35, 654.82, 396.24]\n\nFrame 3:\n  Drone pose: [-97.9, -58.32, 20.67, -42.69, 3.4, 0.0]\n  Target bbox: [626.02, 324.23, 654.37, 395.25]\n\nFrame 4:\n  Drone pose: [-97.51, -58.43, 20.64, -42.5, 3.68, 0.0]\n  Target bbox: [625.74, 325.01, 654.63, 394.46]\n\nFrame 5 (current):\n  Drone pose: [-97.01, -58.45, 20.62, -42.45, 3.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.37, \"ymin\": 329.5, \"xmax\": 654.84, \"ymax\": 389.69}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.0, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.72, "window_alt_abs_m": 1.38, "target_px_mean_hist": 511.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.31, -58.49, 20.19, -42.09, 3.88, 0.0]\n  Target bbox: [625.77, 326.32, 654.57, 393.13]\n\nFrame 2:\n  Drone pose: [-89.79, -58.51, 20.17, -42.09, 3.94, 0.0]\n  Target bbox: [625.61, 324.43, 654.79, 395.13]\n\nFrame 3:\n  Drone pose: [-89.28, -58.54, 20.15, -42.08, 4.0, 0.0]\n  Target bbox: [624.6, 326.82, 655.7, 392.46]\n\nFrame 4:\n  Drone pose: [-88.77, -58.56, 20.13, -42.06, 4.07, 0.0]\n  Target bbox: [625.2, 327.17, 655.07, 392.07]\n\nFrame 5 (current):\n  Drone pose: [-88.26, -58.59, 20.12, -42.04, 4.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.34, \"ymin\": 324.26, \"xmax\": 655.05, \"ymax\": 395.22}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.02, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.04, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.05, \"dz\": -0.04, \"dpitch\": 0.07, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.06, \"dz\": -0.05, \"dpitch\": 0.09, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 0.1, \"dyaw\": 0.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.26, "window_alt_abs_m": 0.07, "target_px_mean_hist": 539.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.23, -58.66, 20.01, -41.92, 4.35, 0.0]\n  Target bbox: [625.15, 325.16, 655.21, 394.29]\n\nFrame 2:\n  Drone pose: [-80.72, -58.66, 20.01, -41.93, 4.35, 0.0]\n  Target bbox: [625.65, 324.75, 654.69, 394.61]\n\nFrame 3:\n  Drone pose: [-80.22, -58.66, 20.01, -41.93, 4.35, 0.0]\n  Target bbox: [624.19, 325.31, 656.16, 394.03]\n\nFrame 4:\n  Drone pose: [-79.71, -58.66, 20.01, -41.93, 4.36, 0.0]\n  Target bbox: [624.31, 322.18, 656.15, 397.3]\n\nFrame 5 (current):\n  Drone pose: [-79.21, -58.66, 20.01, -41.93, 4.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.05, \"ymin\": 323.83, \"xmax\": 656.35, \"ymax\": 395.58}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 545.8, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-72.17, -58.69, 20.0, -41.98, 4.43, 0.0]\n  Target bbox: [625.46, 327.61, 654.83, 391.76]\n\nFrame 2:\n  Drone pose: [-71.67, -58.69, 20.0, -41.98, 4.43, 0.0]\n  Target bbox: [624.37, 323.18, 656.05, 396.24]\n\nFrame 3:\n  Drone pose: [-71.17, -58.69, 20.0, -41.98, 4.43, 0.0]\n  Target bbox: [624.12, 325.07, 656.24, 394.28]\n\nFrame 4:\n  Drone pose: [-70.66, -58.69, 20.0, -41.98, 4.43, 0.0]\n  Target bbox: [625.04, 323.68, 655.39, 395.92]\n\nFrame 5 (current):\n  Drone pose: [-70.16, -58.69, 20.0, -41.99, 4.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.81, \"ymin\": 328.05, \"xmax\": 655.42, \"ymax\": 391.14}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 552.0, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-63.14, -58.69, 20.0, -42.01, 4.43, 0.0]\n  Target bbox: [625.47, 326.55, 654.85, 392.87]\n\nFrame 2:\n  Drone pose: [-62.64, -58.69, 20.0, -42.01, 4.43, 0.0]\n  Target bbox: [625.22, 325.93, 655.12, 393.49]\n\nFrame 3:\n  Drone pose: [-62.14, -58.68, 20.0, -42.01, 4.43, 0.0]\n  Target bbox: [624.36, 324.2, 656.02, 395.18]\n\nFrame 4:\n  Drone pose: [-61.64, -58.68, 20.0, -42.01, 4.42, 0.0]\n  Target bbox: [625.19, 323.57, 655.24, 396.02]\n\nFrame 5 (current):\n  Drone pose: [-61.14, -58.67, 20.0, -42.02, 4.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.01, \"ymin\": 328.56, \"xmax\": 655.21, \"ymax\": 390.62}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 550.8, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.02, -57.66, 20.0, -42.25, 1.67, 0.0]\n  Target bbox: [625.89, 328.52, 654.38, 390.79]\n\nFrame 2:\n  Drone pose: [-53.48, -57.45, 20.0, -42.31, 1.08, 0.0]\n  Target bbox: [624.95, 322.03, 655.52, 397.43]\n\nFrame 3:\n  Drone pose: [-52.93, -57.21, 20.0, -42.38, 0.43, 0.0]\n  Target bbox: [625.35, 321.91, 654.96, 397.58]\n\nFrame 4:\n  Drone pose: [-52.38, -56.96, 20.0, -42.46, -0.27, 0.0]\n  Target bbox: [625.78, 326.04, 654.08, 393.18]\n\nFrame 5 (current):\n  Drone pose: [-51.81, -56.7, 20.0, -42.55, -0.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.67, \"ymin\": 328.66, \"xmax\": 654.06, \"ymax\": 390.6}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.73, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": -1.42, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": 0.72, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -2.02, \"droll\": 0.0}, {\"dx\": 2.36, \"dy\": 0.88, \"dz\": 0.0, \"dpitch\": -0.45, \"dyaw\": -2.48, \"droll\": 0.0}, {\"dx\": 2.97, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -0.6, \"dyaw\": -2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 563.8, "cur_frame_id": 93, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-43.87, -56.72, 20.0, -43.91, -12.69, 0.0]\n  Target bbox: [615.6, 319.97, 664.57, 399.57]\n\nFrame 2:\n  Drone pose: [-43.26, -56.97, 20.0, -43.97, -13.46, 0.0]\n  Target bbox: [620.38, 323.12, 659.84, 396.35]\n\nFrame 3:\n  Drone pose: [-42.67, -57.24, 20.0, -44.02, -14.17, 0.0]\n  Target bbox: [620.94, 323.39, 659.27, 396.06]\n\nFrame 4:\n  Drone pose: [-42.1, -57.54, 20.0, -44.06, -14.8, 0.0]\n  Target bbox: [620.55, 322.3, 659.62, 397.0]\n\nFrame 5 (current):\n  Drone pose: [-41.53, -57.86, 20.0, -44.08, -15.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.19, \"ymin\": 322.6, \"xmax\": 661.0, \"ymax\": 396.84}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": -0.35, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -0.73, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": 1.6, \"dy\": -1.14, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -1.11, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": -1.57, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.3, \"droll\": 0.0}, {\"dx\": 2.6, \"dy\": -2.03, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 572.8, "cur_frame_id": 110, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.89, -63.21, 20.0, -43.71, -14.11, 0.0]\n  Target bbox: [622.2, 321.9, 657.42, 397.48]\n\nFrame 2:\n  Drone pose: [-34.46, -63.43, 20.0, -43.69, -13.44, 0.0]\n  Target bbox: [620.27, 320.11, 659.26, 399.32]\n\nFrame 3:\n  Drone pose: [-34.03, -63.65, 20.0, -43.66, -12.79, 0.0]\n  Target bbox: [622.75, 326.11, 657.04, 393.03]\n\nFrame 4:\n  Drone pose: [-33.59, -63.86, 20.0, -43.64, -12.15, 0.0]\n  Target bbox: [622.22, 322.36, 657.36, 397.14]\n\nFrame 5 (current):\n  Drone pose: [-33.16, -64.07, 20.0, -43.61, -11.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.79, \"ymin\": 325.35, \"xmax\": 656.92, \"ymax\": 393.98}, \"waypoint_deltas\": [{\"dx\": 0.44, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.63, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -0.42, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.25, \"droll\": 0.0}, {\"dx\": 1.33, \"dy\": -0.63, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.86, \"droll\": 0.0}, {\"dx\": 1.78, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 2.45, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 3.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.6, "window_alt_abs_m": 0.0, "target_px_mean_hist": 562.8, "cur_frame_id": 128, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.85, -66.47, 20.0, -43.1, -4.47, 0.0]\n  Target bbox: [623.83, 326.4, 655.87, 392.79]\n\nFrame 2:\n  Drone pose: [-26.42, -66.57, 20.0, -43.02, -4.17, 0.0]\n  Target bbox: [623.23, 321.49, 656.29, 397.92]\n\nFrame 3:\n  Drone pose: [-26.0, -66.66, 20.0, -42.92, -3.91, 0.0]\n  Target bbox: [625.22, 326.7, 654.48, 392.62]\n\nFrame 4:\n  Drone pose: [-25.57, -66.74, 20.0, -42.82, -3.68, 0.0]\n  Target bbox: [625.01, 328.14, 654.73, 391.12]\n\nFrame 5 (current):\n  Drone pose: [-25.14, -66.81, 20.0, -42.73, -3.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.9, \"ymin\": 327.82, \"xmax\": 654.82, \"ymax\": 391.46}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": -0.16, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": 1.78, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 2.24, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": 0.78, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 559.8, "cur_frame_id": 146, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/ORI/frames_playback/frame_00164/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-18.3, -66.45, 20.0, -42.48, -4.43, 0.0]\n  Target bbox: [624.76, 324.37, 654.85, 395.13]\n\nFrame 2:\n  Drone pose: [-17.8, -66.49, 20.0, -42.48, -4.34, 0.0]\n  Target bbox: [625.04, 327.58, 654.68, 391.76]\n\nFrame 3:\n  Drone pose: [-17.31, -66.47, 20.0, -42.48, -4.37, 0.0]\n  Target bbox: [624.88, 322.95, 654.7, 396.53]\n\nFrame 4:\n  Drone pose: [-16.82, -66.44, 20.0, -42.45, -4.47, 0.0]\n  Target bbox: [624.66, 324.01, 654.93, 395.51]\n\nFrame 5 (current):\n  Drone pose: [-16.34, -66.4, 20.0, -42.42, -4.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.51, \"ymin\": 327.84, \"xmax\": 655.25, \"ymax\": 391.32}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": 2.42, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.31, "window_alt_abs_m": 0.0, "target_px_mean_hist": 551.2, "cur_frame_id": 164, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.61, -57.06, 22.0, -46.49, 3.48, 0.0]\n  Target bbox: [576.55, 322.28, 621.79, 396.06]\n\nFrame 2:\n  Drone pose: [-97.91, -58.06, 21.07, -45.83, 7.25, 0.0]\n  Target bbox: [621.62, 325.39, 658.46, 393.94]\n\nFrame 3:\n  Drone pose: [-97.9, -58.32, 20.67, -42.69, 3.4, 0.0]\n  Target bbox: [626.15, 325.29, 654.19, 394.1]\n\nFrame 4:\n  Drone pose: [-97.51, -58.43, 20.64, -40.12, 1.63, 0.0]\n  Target bbox: [651.55, 365.72, 680.14, 434.09]\n\nFrame 5 (current):\n  Drone pose: [-97.01, -58.45, 20.62, -42.45, 3.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.64, \"ymin\": 322.53, \"xmax\": 655.83, \"ymax\": 397.04}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.0, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.5, "window_alt_abs_m": 1.38, "target_px_mean_hist": 538.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.42, -58.55, 20.11, -37.83, 1.14, 0.0]\n  Target bbox: [624.79, 330.0, 655.01, 389.6]\n\nFrame 2:\n  Drone pose: [-89.81, -58.46, 20.19, -46.5, 1.1, 0.0]\n  Target bbox: [579.58, 292.9, 625.81, 370.9]\n\nFrame 3:\n  Drone pose: [-89.28, -58.54, 20.15, -44.55, -1.0, 0.0]\n  Target bbox: [688.06, 284.4, 717.6, 355.8]\n\nFrame 4:\n  Drone pose: [-88.77, -58.56, 20.13, -42.06, 4.07, 0.0]\n  Target bbox: [625.79, 324.89, 654.56, 394.48]\n\nFrame 5 (current):\n  Drone pose: [-88.26, -58.59, 20.12, -42.04, 4.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.53, \"ymin\": 326.05, \"xmax\": 657.55, \"ymax\": 393.27}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.02, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.04, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.05, \"dz\": -0.04, \"dpitch\": 0.07, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.06, \"dz\": -0.05, \"dpitch\": 0.09, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 0.1, \"dyaw\": 0.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.27, "window_alt_abs_m": 0.14, "target_px_mean_hist": 521.5, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.23, -58.66, 20.01, -41.92, 4.35, 0.0]\n  Target bbox: [624.39, 322.57, 656.05, 396.89]\n\nFrame 2:\n  Drone pose: [-80.72, -58.66, 20.01, -41.93, 4.35, 0.0]\n  Target bbox: [625.32, 323.64, 655.08, 395.86]\n\nFrame 3:\n  Drone pose: [-80.22, -58.66, 20.01, -41.93, 4.35, 0.0]\n  Target bbox: [623.97, 321.87, 656.51, 397.64]\n\nFrame 4:\n  Drone pose: [-79.71, -58.66, 20.01, -39.98, 5.79, 0.0]\n  Target bbox: [607.17, 359.93, 636.87, 425.15]\n\nFrame 5 (current):\n  Drone pose: [-79.21, -58.66, 20.01, -41.93, 4.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.3, \"ymin\": 325.47, \"xmax\": 655.02, \"ymax\": 393.84}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.86, "window_alt_abs_m": 0.01, "target_px_mean_hist": 540.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-72.23, -58.82, 19.99, -46.75, 6.67, 0.0]\n  Target bbox: [623.5, 321.54, 656.9, 397.53]\n\nFrame 2:\n  Drone pose: [-71.73, -58.7, 19.96, -40.1, 4.94, 0.0]\n  Target bbox: [621.45, 324.64, 658.51, 394.92]\n\nFrame 3:\n  Drone pose: [-71.24, -58.69, 20.06, -44.75, -2.02, 0.0]\n  Target bbox: [619.19, 321.52, 660.85, 397.75]\n\nFrame 4:\n  Drone pose: [-70.66, -58.69, 20.0, -41.78, 6.15, 0.0]\n  Target bbox: [602.76, 331.46, 633.97, 395.07]\n\nFrame 5 (current):\n  Drone pose: [-70.16, -58.69, 20.0, -38.06, 5.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 606.19, \"ymin\": 388.29, \"xmax\": 642.59, \"ymax\": 463.27}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -3.93, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -3.93, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -3.93, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -3.93, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -3.94, \"dyaw\": -1.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.32, "window_alt_abs_m": 0.2, "target_px_mean_hist": 541.0, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-63.19, -58.6, 19.93, -37.85, 5.75, 0.0]\n  Target bbox: [624.55, 326.71, 655.13, 393.08]\n\nFrame 2:\n  Drone pose: [-62.64, -58.69, 20.0, -42.01, 4.43, 0.0]\n  Target bbox: [625.5, 323.21, 654.91, 396.3]\n\nFrame 3:\n  Drone pose: [-62.14, -58.68, 20.0, -37.56, 4.67, 0.0]\n  Target bbox: [622.52, 400.83, 651.52, 468.34]\n\nFrame 4:\n  Drone pose: [-61.64, -58.68, 20.0, -42.01, 4.42, 0.0]\n  Target bbox: [620.5, 323.04, 659.61, 396.43]\n\nFrame 5 (current):\n  Drone pose: [-61.09, -58.69, 20.12, -40.39, -5.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 690.57, \"ymin\": 315.21, \"xmax\": 719.27, \"ymax\": 375.86}, \"waypoint_deltas\": [{\"dx\": 0.45, \"dy\": 0.02, \"dz\": -0.12, \"dpitch\": -1.63, \"dyaw\": 10.37, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.03, \"dz\": -0.12, \"dpitch\": -1.63, \"dyaw\": 10.35, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": 0.04, \"dz\": -0.12, \"dpitch\": -1.63, \"dyaw\": 10.32, \"droll\": 0.0}, {\"dx\": 1.95, \"dy\": 0.06, \"dz\": -0.12, \"dpitch\": -1.63, \"dyaw\": 10.27, \"droll\": 0.0}, {\"dx\": 2.45, \"dy\": 0.08, \"dz\": -0.12, \"dpitch\": -1.64, \"dyaw\": 10.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.23, "window_alt_abs_m": 0.19, "target_px_mean_hist": 521.8, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.02, -57.66, 20.0, -42.25, 1.67, 0.0]\n  Target bbox: [626.49, 323.36, 653.91, 396.1]\n\nFrame 2:\n  Drone pose: [-53.48, -57.45, 20.0, -37.31, 1.07, 0.0]\n  Target bbox: [625.04, 409.39, 655.63, 478.08]\n\nFrame 3:\n  Drone pose: [-52.93, -57.21, 20.0, -38.92, 5.43, 0.0]\n  Target bbox: [561.25, 380.95, 594.44, 458.49]\n\nFrame 4:\n  Drone pose: [-52.38, -56.96, 20.0, -42.46, -0.27, 0.0]\n  Target bbox: [626.07, 323.85, 653.76, 395.58]\n\nFrame 5 (current):\n  Drone pose: [-51.81, -56.7, 20.0, -41.91, -5.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 680.05, \"ymin\": 339.72, \"xmax\": 711.21, \"ymax\": 403.93}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.73, \"dyaw\": 3.76, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": -0.84, \"dyaw\": 3.07, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": 0.72, \"dz\": 0.0, \"dpitch\": -0.96, \"dyaw\": 2.47, \"droll\": 0.0}, {\"dx\": 2.36, \"dy\": 0.88, \"dz\": 0.0, \"dpitch\": -1.09, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": 2.97, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -1.24, \"dyaw\": 1.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 561.8, "cur_frame_id": 93, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-43.87, -56.72, 20.0, -48.91, -14.32, 0.0]\n  Target bbox: [641.19, 238.78, 679.57, 312.67]\n\nFrame 2:\n  Drone pose: [-43.26, -56.97, 20.0, -46.61, -16.83, 0.0]\n  Target bbox: [656.9, 277.05, 705.76, 355.57]\n\nFrame 3:\n  Drone pose: [-42.67, -57.24, 20.0, -42.64, -15.4, 0.0]\n  Target bbox: [634.58, 344.31, 676.01, 421.78]\n\nFrame 4:\n  Drone pose: [-42.1, -57.54, 20.0, -46.32, -10.63, 0.0]\n  Target bbox: [566.88, 284.04, 611.15, 361.95]\n\nFrame 5 (current):\n  Drone pose: [-41.63, -57.8, 20.14, -45.26, -6.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.55, \"ymin\": 332.87, \"xmax\": 615.37, \"ymax\": 411.42}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": -0.41, \"dz\": -0.14, \"dpitch\": 1.17, \"dyaw\": -9.13, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": -0.79, \"dz\": -0.14, \"dpitch\": 1.17, \"dyaw\": -9.5, \"droll\": 0.0}, {\"dx\": 1.7, \"dy\": -1.2, \"dz\": -0.14, \"dpitch\": 1.18, \"dyaw\": -9.78, \"droll\": 0.0}, {\"dx\": 2.21, \"dy\": -1.63, \"dz\": -0.14, \"dpitch\": 1.2, \"dyaw\": -9.97, \"droll\": 0.0}, {\"dx\": 2.7, \"dy\": -2.09, \"dz\": -0.14, \"dpitch\": 1.23, \"dyaw\": -10.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.65, "window_alt_abs_m": 0.14, "target_px_mean_hist": 556.5, "cur_frame_id": 110, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.89, -63.21, 20.0, -41.47, -11.78, 0.0]\n  Target bbox: [587.62, 358.24, 634.7, 437.36]\n\nFrame 2:\n  Drone pose: [-34.4, -63.49, 19.85, -36.25, -7.41, 0.0]\n  Target bbox: [603.62, 401.57, 643.79, 475.47]\n\nFrame 3:\n  Drone pose: [-34.03, -63.65, 20.0, -46.43, -8.02, 0.0]\n  Target bbox: [563.34, 277.11, 599.61, 352.52]\n\nFrame 4:\n  Drone pose: [-33.59, -63.86, 20.0, -42.81, -15.1, 0.0]\n  Target bbox: [658.04, 336.0, 694.11, 412.27]\n\nFrame 5 (current):\n  Drone pose: [-33.16, -64.07, 20.0, -40.6, -15.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 667.38, \"ymin\": 370.6, \"xmax\": 707.31, \"ymax\": 452.29}, \"waypoint_deltas\": [{\"dx\": 0.44, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": -2.98, \"dyaw\": 4.51, \"droll\": 0.0}, {\"dx\": 0.89, \"dy\": -0.42, \"dz\": 0.0, \"dpitch\": -2.96, \"dyaw\": 5.13, \"droll\": 0.0}, {\"dx\": 1.33, \"dy\": -0.63, \"dz\": 0.0, \"dpitch\": -2.94, \"dyaw\": 5.74, \"droll\": 0.0}, {\"dx\": 1.78, \"dy\": -0.83, \"dz\": 0.0, \"dpitch\": -2.91, \"dyaw\": 6.33, \"droll\": 0.0}, {\"dx\": 2.23, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -2.88, \"dyaw\": 6.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.37, "window_alt_abs_m": 0.3, "target_px_mean_hist": 556.2, "cur_frame_id": 128, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.85, -66.47, 20.0, -43.1, -4.47, 0.0]\n  Target bbox: [625.01, 322.65, 654.58, 396.79]\n\nFrame 2:\n  Drone pose: [-26.42, -66.57, 20.0, -40.37, -1.89, 0.0]\n  Target bbox: [596.24, 372.09, 626.86, 436.43]\n\nFrame 3:\n  Drone pose: [-26.0, -66.64, 20.04, -38.19, 0.9, 0.0]\n  Target bbox: [567.07, 349.03, 594.69, 417.21]\n\nFrame 4:\n  Drone pose: [-25.57, -66.74, 20.0, -40.21, -8.68, 0.0]\n  Target bbox: [684.82, 372.44, 718.78, 438.17]\n\nFrame 5 (current):\n  Drone pose: [-25.14, -66.81, 20.0, -42.91, -0.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 578.81, \"ymin\": 320.02, \"xmax\": 621.58, \"ymax\": 395.09}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": -3.0, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": 0.34, \"dyaw\": -2.84, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": -0.16, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": -2.71, \"droll\": 0.0}, {\"dx\": 1.78, \"dy\": -0.22, \"dz\": 0.0, \"dpitch\": 0.46, \"dyaw\": -2.55, \"droll\": 0.0}, {\"dx\": 2.24, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": 0.51, \"dyaw\": -2.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.35, "window_alt_abs_m": 0.08, "target_px_mean_hist": 545.2, "cur_frame_id": 146, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030/aug_001/frames_playback/frame_00164/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-18.38, -66.35, 20.11, -44.02, 3.01, 0.0]\n  Target bbox: [619.68, 324.69, 660.2, 394.65]\n\nFrame 2:\n  Drone pose: [-17.8, -66.49, 20.0, -42.48, -4.34, 0.0]\n  Target bbox: [623.64, 321.62, 656.87, 397.86]\n\nFrame 3:\n  Drone pose: [-17.31, -66.61, 20.07, -38.57, -4.62, 0.0]\n  Target bbox: [625.77, 330.37, 654.41, 389.0]\n\nFrame 4:\n  Drone pose: [-16.82, -66.44, 20.0, -42.45, -4.47, 0.0]\n  Target bbox: [621.53, 320.5, 658.01, 399.05]\n\nFrame 5 (current):\n  Drone pose: [-16.51, -66.44, 20.05, -34.59, -3.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 637.11, \"ymin\": 396.14, \"xmax\": 673.69, \"ymax\": 467.78}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": 0.04, \"dz\": -0.05, \"dpitch\": -7.8, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": 1.13, \"dy\": 0.06, \"dz\": -0.05, \"dpitch\": -7.78, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 0.04, \"dz\": -0.05, \"dpitch\": -7.76, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 2.1, \"dy\": 0.02, \"dz\": -0.05, \"dpitch\": -7.74, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 2.59, \"dy\": -0.01, \"dz\": -0.05, \"dpitch\": -7.73, \"dyaw\": -0.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.43, "window_alt_abs_m": 0.3, "target_px_mean_hist": 554.2, "cur_frame_id": 164, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776256030", "difficulty_score": 0.2435, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.61, -62.56, 22.0, -46.4, 0.0, 0.0]\n  Target bbox: [624.15, 328.29, 655.85, 390.88]\n\nFrame 2:\n  Drone pose: [-43.09, -63.63, 21.2, -43.89, 2.93, 0.0]\n  Target bbox: [623.96, 327.13, 656.37, 392.16]\n\nFrame 3:\n  Drone pose: [-43.03, -64.12, 20.67, -42.49, 4.18, 0.0]\n  Target bbox: [624.0, 325.8, 656.35, 393.54]\n\nFrame 4:\n  Drone pose: [-42.69, -64.31, 20.64, -42.22, 4.64, 0.0]\n  Target bbox: [623.76, 326.38, 656.58, 392.99]\n\nFrame 5 (current):\n  Drone pose: [-42.22, -64.36, 20.62, -42.13, 4.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.72, \"ymin\": 327.08, \"xmax\": 656.6, \"ymax\": 392.25}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.01, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": -0.01, \"dz\": -0.07, \"dpitch\": 0.06, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": -0.02, \"dz\": -0.09, \"dpitch\": 0.08, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -0.02, \"dz\": -0.2, \"dpitch\": 0.22, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.78, "window_alt_abs_m": 1.38, "target_px_mean_hist": 655.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-35.59, -64.38, 20.19, -41.68, 4.84, 0.0]\n  Target bbox: [624.11, 322.6, 656.36, 396.99]\n\nFrame 2:\n  Drone pose: [-35.08, -64.38, 20.17, -41.66, 4.85, 0.0]\n  Target bbox: [623.45, 325.8, 656.9, 393.55]\n\nFrame 3:\n  Drone pose: [-34.57, -64.38, 20.15, -41.65, 4.87, 0.0]\n  Target bbox: [623.4, 323.07, 657.07, 396.5]\n\nFrame 4:\n  Drone pose: [-34.06, -64.4, 20.13, -41.63, 4.9, 0.0]\n  Target bbox: [623.63, 326.64, 656.67, 392.64]\n\nFrame 5 (current):\n  Drone pose: [-33.55, -64.41, 20.12, -41.62, 4.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.68, \"ymin\": 323.72, \"xmax\": 656.75, \"ymax\": 395.77}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.02, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.05, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": -0.08, \"dz\": -0.04, \"dpitch\": 0.05, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": -0.12, \"dz\": -0.05, \"dpitch\": 0.07, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": -0.15, \"dz\": -0.06, \"dpitch\": 0.08, \"dyaw\": 0.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.1, "window_alt_abs_m": 0.07, "target_px_mean_hist": 684.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.44, -64.68, 20.01, -41.57, 5.7, 0.0]\n  Target bbox: [623.12, 322.74, 657.37, 396.87]\n\nFrame 2:\n  Drone pose: [-25.94, -64.68, 20.01, -41.57, 5.7, 0.0]\n  Target bbox: [623.19, 325.95, 657.15, 393.41]\n\nFrame 3:\n  Drone pose: [-25.44, -64.69, 20.01, -41.57, 5.7, 0.0]\n  Target bbox: [622.81, 324.06, 657.61, 395.42]\n\nFrame 4:\n  Drone pose: [-24.94, -64.69, 20.01, -41.56, 5.7, 0.0]\n  Target bbox: [623.38, 323.72, 657.03, 395.75]\n\nFrame 5 (current):\n  Drone pose: [-24.46, -64.67, 20.01, -41.54, 5.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.37, \"ymin\": 326.28, \"xmax\": 656.94, \"ymax\": 393.01}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.02, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": 0.06, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": 0.1, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": 1.96, \"dy\": 0.14, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": 0.18, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.04, "window_alt_abs_m": 0.01, "target_px_mean_hist": 678.0, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.87, -64.23, 20.0, -41.71, 4.5, 0.0]\n  Target bbox: [623.5, 325.74, 656.85, 393.58]\n\nFrame 2:\n  Drone pose: [-17.34, -64.16, 20.0, -41.75, 4.32, 0.0]\n  Target bbox: [623.74, 322.34, 656.76, 397.28]\n\nFrame 3:\n  Drone pose: [-16.8, -64.07, 20.0, -41.81, 4.1, 0.0]\n  Target bbox: [623.56, 322.58, 656.95, 397.05]\n\nFrame 4:\n  Drone pose: [-16.26, -63.97, 20.0, -41.88, 3.83, 0.0]\n  Target bbox: [623.24, 323.9, 657.2, 395.55]\n\nFrame 5 (current):\n  Drone pose: [-15.71, -63.85, 20.0, -41.95, 3.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.38, \"ymin\": 323.34, \"xmax\": 657.07, \"ymax\": 396.1}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": 0.15, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": 0.3, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.79, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -1.23, \"droll\": 0.0}, {\"dx\": 2.08, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.81, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -2.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 690.0, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-6.11, -61.42, 20.0, -45.36, -9.62, 0.0]\n  Target bbox: [615.1, 318.78, 665.04, 400.63]\n\nFrame 2:\n  Drone pose: [-5.14, -61.39, 20.0, -45.93, -11.51, 0.0]\n  Target bbox: [612.7, 317.07, 667.5, 402.4]\n\nFrame 3:\n  Drone pose: [-4.23, -61.39, 20.0, -46.38, -13.31, 0.0]\n  Target bbox: [614.01, 317.95, 666.2, 401.41]\n\nFrame 4:\n  Drone pose: [-3.42, -61.41, 20.0, -46.68, -15.01, 0.0]\n  Target bbox: [617.22, 319.53, 662.96, 399.61]\n\nFrame 5 (current):\n  Drone pose: [-2.69, -61.45, 20.0, -46.82, -16.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.83, \"ymin\": 319.31, \"xmax\": 663.38, \"ymax\": 399.83}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.59, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": 1.85, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": -4.54, \"droll\": 0.0}, {\"dx\": 2.4, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": -5.71, \"droll\": 0.0}, {\"dx\": 2.93, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": -6.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 757.5, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [4.48, -64.8, 20.03, -44.76, -27.37, 0.0]\n  Target bbox: [616.44, 319.32, 664.02, 400.12]\n\nFrame 2:\n  Drone pose: [4.94, -65.19, 20.04, -44.65, -27.62, 0.0]\n  Target bbox: [618.69, 321.29, 661.64, 397.97]\n\nFrame 3:\n  Drone pose: [5.41, -65.59, 20.05, -44.54, -27.83, 0.0]\n  Target bbox: [616.09, 319.66, 664.34, 399.74]\n\nFrame 4:\n  Drone pose: [5.83, -66.02, 20.06, -44.4, -27.91, 0.0]\n  Target bbox: [616.01, 319.7, 664.43, 399.74]\n\nFrame 5 (current):\n  Drone pose: [6.17, -66.42, 20.07, -44.14, -27.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.12, \"ymin\": 322.5, \"xmax\": 659.18, \"ymax\": 396.75}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": -0.31, \"dz\": 0.01, \"dpitch\": 0.36, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": -0.35, \"dz\": 0.03, \"dpitch\": 0.42, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": 1.57, \"dy\": -0.42, \"dz\": 0.04, \"dpitch\": 0.25, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 3.33, \"dy\": -0.4, \"dz\": 0.06, \"dpitch\": -1.4, \"dyaw\": -3.54, \"droll\": 0.0}, {\"dx\": 4.95, \"dy\": -0.83, \"dz\": 0.09, \"dpitch\": -3.28, \"dyaw\": -4.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.57, "window_alt_abs_m": 0.04, "target_px_mean_hist": 735.0, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [13.58, -77.1, 20.45, -49.59, -6.82, 0.0]\n  Target bbox: [611.96, 315.51, 668.21, 403.64]\n\nFrame 2:\n  Drone pose: [13.84, -77.37, 20.5, -49.22, -7.52, 0.0]\n  Target bbox: [614.1, 317.14, 666.06, 401.97]\n\nFrame 3:\n  Drone pose: [14.09, -77.64, 20.56, -48.85, -8.2, 0.0]\n  Target bbox: [619.51, 319.8, 660.0, 399.24]\n\nFrame 4:\n  Drone pose: [14.35, -77.9, 20.62, -48.6, -7.2, 0.0]\n  Target bbox: [620.04, 319.67, 659.44, 399.44]\n\nFrame 5 (current):\n  Drone pose: [14.61, -78.17, 20.69, -48.35, -6.23, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.19, \"ymin\": 323.99, \"xmax\": 658.46, \"ymax\": 394.91}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.26, \"dz\": 0.07, \"dpitch\": 0.24, \"dyaw\": 0.94, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -0.53, \"dz\": 0.15, \"dpitch\": 0.48, \"dyaw\": 1.86, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": -0.8, \"dz\": 0.23, \"dpitch\": 0.72, \"dyaw\": 2.76, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -1.06, \"dz\": 0.32, \"dpitch\": 0.95, \"dyaw\": 3.64, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": -1.33, \"dz\": 0.41, \"dpitch\": 1.18, \"dyaw\": 4.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.34, "window_alt_abs_m": 0.24, "target_px_mean_hist": 766.2, "cur_frame_id": 109, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [18.84, -81.34, 22.02, -46.1, 3.68, 0.0]\n  Target bbox: [623.77, 323.86, 656.66, 395.5]\n\nFrame 2:\n  Drone pose: [19.65, -81.12, 22.1, -46.67, 3.1, 0.0]\n  Target bbox: [624.52, 323.63, 655.9, 395.67]\n\nFrame 3:\n  Drone pose: [20.25, -81.09, 22.18, -46.92, 3.02, 0.0]\n  Target bbox: [623.69, 324.48, 656.72, 394.8]\n\nFrame 4:\n  Drone pose: [20.82, -81.05, 22.26, -47.14, 2.9, 0.0]\n  Target bbox: [623.83, 327.01, 656.49, 392.13]\n\nFrame 5 (current):\n  Drone pose: [21.35, -81.02, 23.05, -48.23, 2.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.07, \"ymin\": 328.01, \"xmax\": 656.24, \"ymax\": 391.12}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": 0.0, \"dz\": -0.57, \"dpitch\": 0.8, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.0, \"dz\": -0.52, \"dpitch\": 0.76, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": 0.0, \"dz\": -0.51, \"dpitch\": 0.79, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 2.39, \"dy\": -0.01, \"dz\": -0.58, \"dpitch\": 0.92, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.85, "window_alt_abs_m": 1.03, "target_px_mean_hist": 630.2, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [26.73, -82.98, 22.09, -45.11, 8.07, 0.0]\n  Target bbox: [623.27, 325.69, 657.07, 393.63]\n\nFrame 2:\n  Drone pose: [26.89, -83.52, 22.03, -44.46, 9.36, 0.0]\n  Target bbox: [622.55, 323.86, 657.88, 395.66]\n\nFrame 3:\n  Drone pose: [27.17, -83.81, 21.96, -44.01, 10.04, 0.0]\n  Target bbox: [622.92, 323.63, 657.52, 395.97]\n\nFrame 4:\n  Drone pose: [27.59, -83.82, 21.9, -43.82, 10.01, 0.0]\n  Target bbox: [622.95, 324.68, 657.43, 394.81]\n\nFrame 5 (current):\n  Drone pose: [28.0, -83.86, 21.82, -43.58, 10.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.43, \"ymin\": 324.77, \"xmax\": 657.96, \"ymax\": 394.74}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": 0.04, \"dz\": -0.07, \"dpitch\": 0.13, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.05, \"dz\": -0.13, \"dpitch\": 0.23, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.07, \"dz\": -0.16, \"dpitch\": 0.28, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 1.95, \"dy\": 0.06, \"dz\": -0.19, \"dpitch\": 0.31, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": 0.06, \"dz\": -0.21, \"dpitch\": 0.3, \"dyaw\": -0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.07, "window_alt_abs_m": 0.26, "target_px_mean_hist": 615.0, "cur_frame_id": 144, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/ORI/frames_playback/frame_00162/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.45, -83.83, 22.02, -44.46, 10.22, 0.0]\n  Target bbox: [622.88, 325.15, 657.47, 394.24]\n\nFrame 2:\n  Drone pose: [35.93, -83.84, 22.14, -44.59, 10.24, 0.0]\n  Target bbox: [622.63, 326.84, 657.66, 392.51]\n\nFrame 3:\n  Drone pose: [36.39, -83.85, 22.27, -44.72, 10.25, 0.0]\n  Target bbox: [622.87, 326.69, 657.43, 392.65]\n\nFrame 4:\n  Drone pose: [36.85, -83.86, 22.42, -44.87, 10.26, 0.0]\n  Target bbox: [623.12, 326.12, 657.19, 393.23]\n\nFrame 5 (current):\n  Drone pose: [37.31, -83.91, 22.58, -45.01, 10.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.74, \"ymin\": 324.77, \"xmax\": 657.65, \"ymax\": 394.74}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.0, \"dz\": 0.17, \"dpitch\": -0.2, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": 0.09, \"dz\": 0.35, \"dpitch\": -0.44, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": 0.12, \"dz\": 0.54, \"dpitch\": -0.68, \"dyaw\": -0.33, \"droll\": 0.0}, {\"dx\": 1.94, \"dy\": 0.09, \"dz\": 0.79, \"dpitch\": -0.98, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 2.46, \"dy\": 0.08, \"dz\": 1.01, \"dpitch\": -1.29, \"dyaw\": -0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.14, "window_alt_abs_m": 0.56, "target_px_mean_hist": 605.5, "cur_frame_id": 162, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.61, -62.56, 22.0, -46.4, 0.0, 0.0]\n  Target bbox: [624.95, 325.21, 655.05, 394.05]\n\nFrame 2:\n  Drone pose: [-43.08, -63.75, 21.18, -43.85, 3.25, 0.0]\n  Target bbox: [623.84, 324.43, 656.59, 395.0]\n\nFrame 3:\n  Drone pose: [-43.03, -64.12, 20.67, -44.07, -0.82, 0.0]\n  Target bbox: [685.92, 298.92, 719.0, 371.17]\n\nFrame 4:\n  Drone pose: [-42.65, -64.46, 20.63, -42.24, 5.05, 0.0]\n  Target bbox: [623.73, 323.89, 656.72, 395.67]\n\nFrame 5 (current):\n  Drone pose: [-42.3, -64.35, 20.63, -40.56, 0.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 676.99, \"ymin\": 350.48, \"xmax\": 709.44, \"ymax\": 422.12}, \"waypoint_deltas\": [{\"dx\": 0.58, \"dy\": -0.02, \"dz\": -0.04, \"dpitch\": -1.54, \"dyaw\": 4.32, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": -0.03, \"dz\": -0.06, \"dpitch\": -1.52, \"dyaw\": 4.32, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -0.02, \"dz\": -0.08, \"dpitch\": -1.51, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": 2.12, \"dy\": -0.03, \"dz\": -0.1, \"dpitch\": -1.49, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": 2.63, \"dy\": -0.03, \"dz\": -0.21, \"dpitch\": -1.35, \"dyaw\": 4.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.78, "window_alt_abs_m": 1.37, "target_px_mean_hist": 650.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-35.53, -64.41, 20.09, -42.21, 3.31, 0.0]\n  Target bbox: [644.01, 313.03, 678.03, 386.55]\n\nFrame 2:\n  Drone pose: [-35.08, -64.38, 20.17, -38.57, 0.14, 0.0]\n  Target bbox: [682.61, 380.68, 716.76, 445.54]\n\nFrame 3:\n  Drone pose: [-34.57, -64.38, 20.15, -41.65, 4.87, 0.0]\n  Target bbox: [623.62, 323.03, 656.87, 396.6]\n\nFrame 4:\n  Drone pose: [-34.02, -64.23, 20.04, -41.56, 4.47, 0.0]\n  Target bbox: [623.19, 323.8, 657.26, 395.7]\n\nFrame 5 (current):\n  Drone pose: [-33.55, -64.41, 20.12, -37.31, 4.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.7, \"ymin\": 395.14, \"xmax\": 663.72, \"ymax\": 469.41}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.02, \"dz\": -0.02, \"dpitch\": -4.29, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.05, \"dz\": -0.03, \"dpitch\": -4.27, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": -0.08, \"dz\": -0.04, \"dpitch\": -4.26, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": -0.12, \"dz\": -0.05, \"dpitch\": -4.24, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": -0.15, \"dz\": -0.06, \"dpitch\": -4.23, \"dyaw\": 0.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.33, "window_alt_abs_m": 0.3, "target_px_mean_hist": 685.8, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.46, -64.58, 20.06, -41.63, 5.42, 0.0]\n  Target bbox: [622.98, 323.23, 657.47, 396.27]\n\nFrame 2:\n  Drone pose: [-25.94, -64.68, 20.01, -43.28, 4.91, 0.0]\n  Target bbox: [633.4, 294.55, 667.25, 367.69]\n\nFrame 3:\n  Drone pose: [-25.44, -64.69, 20.01, -41.57, 5.7, 0.0]\n  Target bbox: [623.3, 322.36, 657.21, 397.29]\n\nFrame 4:\n  Drone pose: [-24.94, -64.69, 20.01, -41.56, 5.7, 0.0]\n  Target bbox: [623.38, 323.33, 657.06, 396.19]\n\nFrame 5 (current):\n  Drone pose: [-24.46, -64.67, 20.01, -41.54, 5.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.39, \"ymin\": 323.41, \"xmax\": 657.04, \"ymax\": 396.09}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.02, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": 0.06, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": 0.1, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": 1.96, \"dy\": 0.14, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": 0.18, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.34, "window_alt_abs_m": 0.05, "target_px_mean_hist": 685.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.87, -64.23, 20.0, -41.54, 8.22, 0.0]\n  Target bbox: [575.0, 326.35, 611.43, 400.73]\n\nFrame 2:\n  Drone pose: [-17.38, -64.07, 20.13, -43.92, 0.84, 0.0]\n  Target bbox: [664.38, 291.0, 697.43, 362.24]\n\nFrame 3:\n  Drone pose: [-16.79, -64.19, 19.86, -36.67, 3.76, 0.0]\n  Target bbox: [631.49, 407.99, 665.41, 477.45]\n\nFrame 4:\n  Drone pose: [-16.25, -64.15, 20.05, -38.81, 4.37, 0.0]\n  Target bbox: [622.84, 379.08, 655.94, 445.36]\n\nFrame 5 (current):\n  Drone pose: [-15.73, -63.79, 19.95, -38.1, 3.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.6, \"ymin\": 386.75, \"xmax\": 656.43, \"ymax\": 458.48}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": 0.09, \"dz\": 0.05, \"dpitch\": -3.92, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": 0.24, \"dz\": 0.05, \"dpitch\": -3.98, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": 0.4, \"dz\": 0.05, \"dpitch\": -4.0, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": 2.1, \"dy\": 0.57, \"dz\": 0.05, \"dpitch\": -4.0, \"dyaw\": -1.59, \"droll\": 0.0}, {\"dx\": 2.58, \"dy\": 0.75, \"dz\": 0.05, \"dpitch\": -3.98, \"dyaw\": -2.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.87, "window_alt_abs_m": 0.69, "target_px_mean_hist": 689.5, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-6.11, -61.42, 20.0, -45.36, -9.62, 0.0]\n  Target bbox: [617.55, 320.46, 662.57, 398.84]\n\nFrame 2:\n  Drone pose: [-5.11, -61.49, 19.85, -44.95, -10.85, 0.0]\n  Target bbox: [611.52, 332.88, 659.89, 414.42]\n\nFrame 3:\n  Drone pose: [-4.23, -61.39, 20.0, -46.38, -13.31, 0.0]\n  Target bbox: [617.8, 320.48, 662.35, 398.72]\n\nFrame 4:\n  Drone pose: [-3.42, -61.41, 20.0, -46.03, -11.75, 0.0]\n  Target bbox: [580.27, 332.15, 623.68, 410.35]\n\nFrame 5 (current):\n  Drone pose: [-2.69, -61.45, 20.0, -46.82, -16.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.66, \"ymin\": 319.84, \"xmax\": 662.54, \"ymax\": 399.28}, \"waypoint_deltas\": [{\"dx\": 0.65, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.59, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": 1.85, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": -4.54, \"droll\": 0.0}, {\"dx\": 2.4, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": -5.71, \"droll\": 0.0}, {\"dx\": 2.93, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": -6.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.14, "window_alt_abs_m": 0.3, "target_px_mean_hist": 752.5, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [4.48, -64.8, 20.03, -44.76, -27.37, 0.0]\n  Target bbox: [619.87, 321.71, 660.45, 397.56]\n\nFrame 2:\n  Drone pose: [4.94, -65.19, 20.04, -40.02, -30.24, 0.0]\n  Target bbox: [653.12, 400.86, 690.75, 474.91]\n\nFrame 3:\n  Drone pose: [5.41, -65.59, 20.05, -45.24, -28.31, 0.0]\n  Target bbox: [624.52, 309.52, 667.82, 386.53]\n\nFrame 4:\n  Drone pose: [5.74, -66.05, 19.92, -39.11, -28.19, 0.0]\n  Target bbox: [623.74, 403.38, 668.7, 484.35]\n\nFrame 5 (current):\n  Drone pose: [6.17, -66.42, 20.07, -44.14, -27.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.53, \"ymin\": 322.78, \"xmax\": 658.75, \"ymax\": 396.43}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": -0.31, \"dz\": 0.01, \"dpitch\": 0.36, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": -0.35, \"dz\": 0.03, \"dpitch\": 0.42, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": 1.57, \"dy\": -0.42, \"dz\": 0.04, \"dpitch\": 0.25, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 3.33, \"dy\": -0.4, \"dz\": 0.06, \"dpitch\": -1.4, \"dyaw\": -3.54, \"droll\": 0.0}, {\"dx\": 4.95, \"dy\": -0.83, \"dz\": 0.09, \"dpitch\": -3.28, \"dyaw\": -4.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.16, "window_alt_abs_m": 0.29, "target_px_mean_hist": 741.8, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [13.46, -77.02, 20.45, -52.33, -2.85, 0.0]\n  Target bbox: [566.35, 268.65, 620.16, 354.1]\n\nFrame 2:\n  Drone pose: [13.84, -77.37, 20.5, -51.55, -4.02, 0.0]\n  Target bbox: [574.87, 279.0, 627.18, 363.45]\n\nFrame 3:\n  Drone pose: [13.94, -77.66, 20.49, -52.14, -3.71, 0.0]\n  Target bbox: [571.08, 262.41, 610.82, 336.45]\n\nFrame 4:\n  Drone pose: [14.35, -77.9, 20.62, -48.6, -7.2, 0.0]\n  Target bbox: [620.01, 319.07, 659.44, 400.09]\n\nFrame 5 (current):\n  Drone pose: [14.61, -78.17, 20.69, -49.71, -1.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 569.88, \"ymin\": 301.88, \"xmax\": 608.64, \"ymax\": 374.41}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.26, \"dz\": 0.07, \"dpitch\": 1.6, \"dyaw\": -3.54, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -0.53, \"dz\": 0.15, \"dpitch\": 1.84, \"dyaw\": -2.62, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": -0.8, \"dz\": 0.23, \"dpitch\": 2.08, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -1.06, \"dz\": 0.32, \"dpitch\": 2.31, \"dyaw\": -0.84, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": -1.33, \"dz\": 0.41, \"dpitch\": 2.54, \"dyaw\": 0.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.44, "window_alt_abs_m": 0.28, "target_px_mean_hist": 757.5, "cur_frame_id": 109, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [18.92, -81.33, 21.88, -50.31, 2.12, 0.0]\n  Target bbox: [642.1, 251.95, 674.9, 323.53]\n\nFrame 2:\n  Drone pose: [19.49, -81.03, 22.14, -48.4, 7.17, 0.0]\n  Target bbox: [571.97, 296.56, 606.83, 361.67]\n\nFrame 3:\n  Drone pose: [20.1, -81.16, 22.24, -50.64, 7.87, 0.0]\n  Target bbox: [567.74, 263.81, 603.85, 329.28]\n\nFrame 4:\n  Drone pose: [20.82, -81.05, 22.26, -47.14, 2.9, 0.0]\n  Target bbox: [623.92, 326.76, 656.42, 392.41]\n\nFrame 5 (current):\n  Drone pose: [21.27, -80.9, 23.07, -52.05, 1.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 639.62, \"ymin\": 259.37, \"xmax\": 670.6, \"ymax\": 329.33}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": -0.12, \"dz\": -0.02, \"dpitch\": 3.83, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": -0.12, \"dz\": -0.59, \"dpitch\": 4.62, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -0.12, \"dz\": -0.54, \"dpitch\": 4.58, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.12, \"dz\": -0.53, \"dpitch\": 4.61, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": -0.13, \"dz\": -0.6, \"dpitch\": 4.74, \"dyaw\": 1.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.46, "window_alt_abs_m": 1.19, "target_px_mean_hist": 640.8, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00144/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [26.73, -82.98, 22.09, -45.11, 8.07, 0.0]\n  Target bbox: [623.18, 323.22, 657.26, 396.27]\n\nFrame 2:\n  Drone pose: [26.89, -83.52, 22.03, -44.46, 9.36, 0.0]\n  Target bbox: [623.3, 327.11, 656.97, 392.16]\n\nFrame 3:\n  Drone pose: [27.11, -83.83, 21.97, -43.94, 10.07, 0.0]\n  Target bbox: [622.94, 325.34, 657.42, 394.12]\n\nFrame 4:\n  Drone pose: [27.59, -83.82, 21.9, -43.82, 10.01, 0.0]\n  Target bbox: [622.21, 324.17, 658.2, 395.34]\n\nFrame 5 (current):\n  Drone pose: [28.0, -83.86, 21.82, -43.58, 10.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.95, \"ymin\": 327.38, \"xmax\": 657.32, \"ymax\": 391.99}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": 0.04, \"dz\": -0.07, \"dpitch\": 0.13, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 0.05, \"dz\": -0.13, \"dpitch\": 0.23, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": 0.07, \"dz\": -0.16, \"dpitch\": 0.28, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 1.95, \"dy\": 0.06, \"dz\": -0.19, \"dpitch\": 0.31, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": 2.47, \"dy\": 0.06, \"dz\": -0.21, \"dpitch\": 0.3, \"dyaw\": -0.15, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.13, "window_alt_abs_m": 0.26, "target_px_mean_hist": 612.0, "cur_frame_id": 144, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215/aug_001/frames_playback/frame_00162/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [35.64, -83.8, 21.96, -41.91, 15.23, 0.0]\n  Target bbox: [561.07, 371.1, 599.01, 442.93]\n\nFrame 2:\n  Drone pose: [35.87, -83.73, 22.17, -44.57, 9.92, 0.0]\n  Target bbox: [622.52, 324.93, 657.85, 394.5]\n\nFrame 3:\n  Drone pose: [36.44, -83.82, 22.37, -47.99, 14.04, 0.0]\n  Target bbox: [575.42, 272.85, 612.52, 346.13]\n\nFrame 4:\n  Drone pose: [36.85, -83.86, 22.42, -44.87, 10.26, 0.0]\n  Target bbox: [622.75, 324.64, 657.64, 394.86]\n\nFrame 5 (current):\n  Drone pose: [37.31, -84.0, 22.52, -43.3, 12.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 605.27, \"ymin\": 351.72, \"xmax\": 641.13, \"ymax\": 421.81}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.09, \"dz\": 0.23, \"dpitch\": -1.91, \"dyaw\": -1.65, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": 0.18, \"dz\": 0.41, \"dpitch\": -2.15, \"dyaw\": -1.89, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": 0.21, \"dz\": 0.6, \"dpitch\": -2.39, \"dyaw\": -1.97, \"droll\": 0.0}, {\"dx\": 1.94, \"dy\": 0.18, \"dz\": 0.85, \"dpitch\": -2.69, \"dyaw\": -1.9, \"droll\": 0.0}, {\"dx\": 2.46, \"dy\": 0.17, \"dz\": 1.07, \"dpitch\": -3.0, \"dyaw\": -1.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.96, "window_alt_abs_m": 0.56, "target_px_mean_hist": 611.2, "cur_frame_id": 162, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_215", "difficulty_score": 0.3918, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.89, -44.09, 22.0, -46.27, -8.53, 0.0]\n  Target bbox: [625.92, 325.0, 653.74, 394.41]\n\nFrame 2:\n  Drone pose: [-57.1, -45.22, 21.2, -44.34, -5.13, 0.0]\n  Target bbox: [626.46, 325.25, 653.18, 394.27]\n\nFrame 3:\n  Drone pose: [-56.95, -45.83, 20.67, -43.17, -3.36, 0.0]\n  Target bbox: [627.5, 328.49, 652.26, 390.84]\n\nFrame 4:\n  Drone pose: [-56.61, -46.21, 20.64, -42.93, -2.31, 0.0]\n  Target bbox: [623.08, 326.91, 657.01, 392.57]\n\nFrame 5 (current):\n  Drone pose: [-56.2, -46.51, 20.62, -42.76, -2.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.19, \"ymin\": 323.32, \"xmax\": 661.86, \"ymax\": 396.33}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.26, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": -0.64, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -0.49, \"dz\": -0.05, \"dpitch\": 0.28, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -0.68, \"dz\": -0.07, \"dpitch\": 0.35, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": -0.84, \"dz\": -0.09, \"dpitch\": 0.41, \"dyaw\": -0.4, \"droll\": 0.0}, {\"dx\": 2.29, \"dy\": -0.97, \"dz\": -0.2, \"dpitch\": 0.58, \"dyaw\": -0.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.76, "window_alt_abs_m": 1.38, "target_px_mean_hist": 518.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.89, -48.2, 20.19, -41.9, -0.97, 0.0]\n  Target bbox: [628.57, 327.35, 651.16, 392.1]\n\nFrame 2:\n  Drone pose: [-49.39, -48.28, 20.17, -41.88, -0.74, 0.0]\n  Target bbox: [627.89, 327.86, 651.84, 391.53]\n\nFrame 3:\n  Drone pose: [-48.88, -48.36, 20.15, -41.85, -0.52, 0.0]\n  Target bbox: [627.98, 325.86, 651.69, 393.69]\n\nFrame 4:\n  Drone pose: [-48.38, -48.44, 20.13, -41.83, -0.31, 0.0]\n  Target bbox: [628.68, 327.02, 651.15, 392.43]\n\nFrame 5 (current):\n  Drone pose: [-47.88, -48.52, 20.12, -41.81, -0.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.74, \"ymin\": 324.08, \"xmax\": 652.2, \"ymax\": 395.55}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.07, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.14, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.2, \"dz\": -0.04, \"dpitch\": 0.05, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.27, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.33, \"dz\": -0.06, \"dpitch\": 0.09, \"dyaw\": 0.89, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.88, "window_alt_abs_m": 0.07, "target_px_mean_hist": 553.2, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.12, -41.69, 20.02, -43.11, -19.85, 0.0]\n  Target bbox: [622.15, 322.24, 657.47, 397.33]\n\nFrame 2:\n  Drone pose: [-37.92, -40.18, 20.01, -43.21, -24.58, 0.0]\n  Target bbox: [621.1, 322.29, 658.54, 397.31]\n\nFrame 3:\n  Drone pose: [-36.88, -39.2, 20.01, -43.28, -27.76, 0.0]\n  Target bbox: [618.71, 320.42, 660.87, 399.26]\n\nFrame 4:\n  Drone pose: [-35.93, -38.56, 20.01, -43.41, -30.0, 0.0]\n  Target bbox: [619.9, 321.65, 659.77, 397.91]\n\nFrame 5 (current):\n  Drone pose: [-35.43, -38.56, 20.01, -43.4, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.07, \"ymin\": 321.01, \"xmax\": 660.61, \"ymax\": 398.55}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.15, "window_alt_abs_m": 0.01, "target_px_mean_hist": 546.5, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.93, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [618.17, 319.98, 661.44, 399.66]\n\nFrame 2:\n  Drone pose: [-28.43, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [619.91, 322.03, 659.76, 397.57]\n\nFrame 3:\n  Drone pose: [-27.93, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [621.61, 324.14, 658.19, 395.3]\n\nFrame 4:\n  Drone pose: [-27.43, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [618.11, 320.19, 661.49, 399.48]\n\nFrame 5 (current):\n  Drone pose: [-26.93, -38.56, 20.0, -43.39, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.84, \"ymin\": 320.58, \"xmax\": 660.82, \"ymax\": 398.99}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 554.5, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-20.43, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [618.67, 320.89, 660.95, 398.77]\n\nFrame 2:\n  Drone pose: [-19.93, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [623.49, 326.07, 656.38, 393.29]\n\nFrame 3:\n  Drone pose: [-19.2, -38.57, 20.0, -43.68, -30.31, 0.0]\n  Target bbox: [617.98, 319.77, 661.65, 399.83]\n\nFrame 4:\n  Drone pose: [-18.47, -38.58, 20.0, -43.98, -30.62, 0.0]\n  Target bbox: [617.8, 319.91, 661.8, 399.72]\n\nFrame 5 (current):\n  Drone pose: [-17.74, -38.59, 20.0, -44.27, -30.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.09, \"ymin\": 325.76, \"xmax\": 656.8, \"ymax\": 393.53}, \"waypoint_deltas\": [{\"dx\": 0.73, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.6, \"dyaw\": -0.66, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": -0.9, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 2.91, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -1.21, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 3.64, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -1.51, \"dyaw\": -1.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.94, "window_alt_abs_m": 0.0, "target_px_mean_hist": 562.5, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00090/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-7.54, -38.72, 20.0, -48.63, -36.2, 0.0]\n  Target bbox: [618.85, 321.12, 660.98, 397.97]\n\nFrame 2:\n  Drone pose: [-6.82, -38.73, 20.0, -48.96, -36.64, 0.0]\n  Target bbox: [614.59, 316.57, 665.06, 402.59]\n\nFrame 3:\n  Drone pose: [-6.32, -38.74, 20.0, -48.96, -36.61, 0.0]\n  Target bbox: [616.44, 318.48, 663.31, 400.62]\n\nFrame 4:\n  Drone pose: [-5.82, -38.74, 20.0, -48.96, -36.61, 0.0]\n  Target bbox: [621.77, 324.17, 658.15, 394.84]\n\nFrame 5 (current):\n  Drone pose: [-5.55, -38.8, 20.0, -48.7, -35.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.99, \"ymin\": 316.81, \"xmax\": 664.66, \"ymax\": 402.36}, \"waypoint_deltas\": [{\"dx\": 0.27, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 0.52, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": 0.8, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": 0.78, \"dyaw\": 1.88, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.26, \"dz\": 0.0, \"dpitch\": 1.04, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": 1.3, \"dyaw\": 3.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.13, "window_alt_abs_m": 0.0, "target_px_mean_hist": 634.2, "cur_frame_id": 90, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00107/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.61, -39.3, 20.0, -45.65, -30.04, 0.0]\n  Target bbox: [622.02, 323.94, 657.81, 395.3]\n\nFrame 2:\n  Drone pose: [-1.18, -39.26, 20.0, -45.51, -30.04, 0.0]\n  Target bbox: [622.54, 324.95, 657.31, 394.32]\n\nFrame 3:\n  Drone pose: [-0.76, -39.22, 20.0, -45.38, -30.03, 0.0]\n  Target bbox: [623.72, 326.27, 656.2, 392.91]\n\nFrame 4:\n  Drone pose: [-0.34, -39.17, 20.0, -45.24, -30.03, 0.0]\n  Target bbox: [617.58, 318.94, 662.04, 400.51]\n\nFrame 5 (current):\n  Drone pose: [0.08, -39.13, 20.0, -45.1, -30.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.57, \"ymin\": 319.11, \"xmax\": 662.02, \"ymax\": 400.43}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": 0.4, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.69, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": 0.53, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": 0.67, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 589.2, "cur_frame_id": 107, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00120/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00124/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.57, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [620.13, 322.47, 659.56, 397.11]\n\nFrame 2:\n  Drone pose: [6.07, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [618.45, 320.54, 661.15, 399.12]\n\nFrame 3:\n  Drone pose: [6.57, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [622.61, 325.2, 657.22, 394.23]\n\nFrame 4:\n  Drone pose: [7.07, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [620.06, 322.25, 659.67, 397.26]\n\nFrame 5 (current):\n  Drone pose: [7.57, -38.56, 20.0, -43.39, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.16, \"ymin\": 319.97, \"xmax\": 661.46, \"ymax\": 399.65}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 565.2, "cur_frame_id": 124, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00137/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00141/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [14.07, -38.56, 20.01, -43.4, -30.0, 0.0]\n  Target bbox: [624.3, 327.14, 655.62, 392.16]\n\nFrame 2:\n  Drone pose: [14.57, -38.56, 20.01, -43.4, -30.0, 0.0]\n  Target bbox: [618.02, 320.0, 661.58, 399.66]\n\nFrame 3:\n  Drone pose: [15.07, -38.56, 20.01, -43.41, -30.0, 0.0]\n  Target bbox: [619.94, 322.29, 659.74, 397.3]\n\nFrame 4:\n  Drone pose: [15.57, -38.56, 20.01, -43.41, -30.0, 0.0]\n  Target bbox: [623.83, 326.72, 656.05, 392.65]\n\nFrame 5 (current):\n  Drone pose: [16.07, -38.56, 20.02, -43.42, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.57, \"ymin\": 322.78, \"xmax\": 659.14, \"ymax\": 396.77}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 560.0, "cur_frame_id": 141, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00154/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00155/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00156/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00157/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/ORI/frames_playback/frame_00158/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [22.82, -39.08, 20.16, -44.32, -29.03, 0.0]\n  Target bbox: [619.11, 321.35, 660.53, 398.25]\n\nFrame 2:\n  Drone pose: [23.38, -39.21, 20.18, -44.53, -28.78, 0.0]\n  Target bbox: [621.26, 323.67, 658.5, 395.78]\n\nFrame 3:\n  Drone pose: [23.95, -39.34, 20.21, -44.75, -28.53, 0.0]\n  Target bbox: [623.73, 326.54, 656.15, 392.75]\n\nFrame 4:\n  Drone pose: [24.51, -39.48, 20.24, -44.97, -28.27, 0.0]\n  Target bbox: [619.94, 321.62, 659.72, 397.87]\n\nFrame 5 (current):\n  Drone pose: [25.07, -39.61, 20.28, -45.2, -28.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.21, \"ymin\": 322.99, \"xmax\": 658.52, \"ymax\": 396.41}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.13, \"dz\": 0.04, \"dpitch\": -0.23, \"dyaw\": 0.26, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.26, \"dz\": 0.08, \"dpitch\": -0.47, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": 1.69, \"dy\": -0.39, \"dz\": 0.13, \"dpitch\": -0.72, \"dyaw\": 0.8, \"droll\": 0.0}, {\"dx\": 2.25, \"dy\": -0.52, \"dz\": 0.18, \"dpitch\": -0.97, \"dyaw\": 1.07, \"droll\": 0.0}, {\"dx\": 2.81, \"dy\": -0.65, \"dz\": 0.23, \"dpitch\": -1.23, \"dyaw\": 1.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.02, "window_alt_abs_m": 0.12, "target_px_mean_hist": 571.0, "cur_frame_id": 158, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.84, -44.0, 21.96, -45.96, -10.99, 0.0]\n  Target bbox: [653.59, 291.2, 676.89, 349.93]\n\nFrame 2:\n  Drone pose: [-57.1, -45.22, 21.2, -47.0, -0.23, 0.0]\n  Target bbox: [567.21, 285.0, 594.37, 348.83]\n\nFrame 3:\n  Drone pose: [-56.95, -45.83, 20.67, -40.17, 0.14, 0.0]\n  Target bbox: [583.08, 374.91, 610.25, 447.19]\n\nFrame 4:\n  Drone pose: [-56.61, -46.21, 20.64, -40.15, 2.69, 0.0]\n  Target bbox: [560.24, 375.45, 596.18, 441.07]\n\nFrame 5 (current):\n  Drone pose: [-56.2, -46.51, 20.62, -45.39, -5.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 657.71, \"ymin\": 282.56, \"xmax\": 694.39, \"ymax\": 349.87}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.26, \"dz\": -0.03, \"dpitch\": 2.78, \"dyaw\": 2.25, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -0.49, \"dz\": -0.05, \"dpitch\": 2.91, \"dyaw\": 1.53, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -0.68, \"dz\": -0.07, \"dpitch\": 2.98, \"dyaw\": 2.06, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": -0.84, \"dz\": -0.09, \"dpitch\": 3.04, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": 2.29, \"dy\": -0.97, \"dz\": -0.2, \"dpitch\": 3.21, \"dyaw\": 2.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.11, "window_alt_abs_m": 1.35, "target_px_mean_hist": 497.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-49.89, -48.2, 20.19, -41.9, -0.97, 0.0]\n  Target bbox: [627.01, 323.68, 652.6, 395.85]\n\nFrame 2:\n  Drone pose: [-49.39, -48.28, 20.17, -38.99, -0.96, 0.0]\n  Target bbox: [622.63, 372.82, 662.96, 443.68]\n\nFrame 3:\n  Drone pose: [-49.01, -48.48, 20.08, -38.3, -6.24, 0.0]\n  Target bbox: [624.12, 328.29, 655.81, 391.38]\n\nFrame 4:\n  Drone pose: [-48.38, -48.44, 20.13, -41.83, -0.31, 0.0]\n  Target bbox: [627.64, 323.92, 652.14, 395.72]\n\nFrame 5 (current):\n  Drone pose: [-47.88, -48.52, 20.12, -41.81, -0.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.58, \"ymin\": 325.51, \"xmax\": 651.36, \"ymax\": 393.98}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.07, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.14, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.37, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.2, \"dz\": -0.04, \"dpitch\": 0.05, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.27, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.33, \"dz\": -0.06, \"dpitch\": 0.09, \"dyaw\": 0.89, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.43, "window_alt_abs_m": 0.19, "target_px_mean_hist": 532.2, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.12, -41.69, 20.02, -40.82, -23.16, 0.0]\n  Target bbox: [664.87, 366.23, 696.68, 431.67]\n\nFrame 2:\n  Drone pose: [-37.92, -40.18, 20.01, -44.65, -19.58, 0.0]\n  Target bbox: [559.0, 300.8, 597.52, 374.16]\n\nFrame 3:\n  Drone pose: [-36.88, -39.2, 20.01, -42.83, -32.76, 0.0]\n  Target bbox: [678.66, 329.44, 724.06, 409.21]\n\nFrame 4:\n  Drone pose: [-35.93, -38.56, 20.01, -46.06, -31.44, 0.0]\n  Target bbox: [635.86, 275.45, 679.83, 355.43]\n\nFrame 5 (current):\n  Drone pose: [-35.43, -38.56, 20.01, -43.4, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.57, \"ymin\": 326.55, \"xmax\": 656.3, \"ymax\": 392.84}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.52, "window_alt_abs_m": 0.01, "target_px_mean_hist": 568.0, "cur_frame_id": 38, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.78, -38.49, 20.11, -41.66, -37.66, 0.0]\n  Target bbox: [626.75, 327.96, 653.03, 391.51]\n\nFrame 2:\n  Drone pose: [-28.45, -38.58, 20.1, -42.61, -32.71, 0.0]\n  Target bbox: [622.77, 322.49, 656.79, 397.12]\n\nFrame 3:\n  Drone pose: [-27.93, -38.56, 20.0, -43.48, -31.58, 0.0]\n  Target bbox: [640.23, 321.24, 678.85, 395.78]\n\nFrame 4:\n  Drone pose: [-27.43, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [622.83, 325.8, 657.02, 393.62]\n\nFrame 5 (current):\n  Drone pose: [-26.93, -38.56, 20.0, -45.79, -27.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 593.59, \"ymin\": 283.79, \"xmax\": 623.68, \"ymax\": 356.2}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.4, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.4, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.4, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.4, \"dyaw\": -2.53, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 2.4, \"dyaw\": -2.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.19, "window_alt_abs_m": 0.11, "target_px_mean_hist": 524.0, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-20.43, -38.56, 20.0, -40.22, -31.38, 0.0]\n  Target bbox: [635.22, 373.27, 678.69, 453.21]\n\nFrame 2:\n  Drone pose: [-19.93, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [623.85, 326.62, 656.06, 392.69]\n\nFrame 3:\n  Drone pose: [-19.2, -38.57, 20.0, -39.5, -33.08, 0.0]\n  Target bbox: [651.29, 393.06, 697.08, 468.27]\n\nFrame 4:\n  Drone pose: [-18.46, -38.76, 19.99, -48.0, -29.35, 0.0]\n  Target bbox: [654.69, 309.87, 691.94, 383.17]\n\nFrame 5 (current):\n  Drone pose: [-17.74, -38.59, 20.0, -45.83, -35.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 677.47, \"ymin\": 296.48, \"xmax\": 723.39, \"ymax\": 374.34}, \"waypoint_deltas\": [{\"dx\": 0.73, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 1.26, \"dyaw\": 4.68, \"droll\": 0.0}, {\"dx\": 1.45, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.96, \"dyaw\": 4.34, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.66, \"dyaw\": 4.0, \"droll\": 0.0}, {\"dx\": 2.91, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": 3.66, \"droll\": 0.0}, {\"dx\": 3.64, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 3.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.78, "window_alt_abs_m": 0.03, "target_px_mean_hist": 561.5, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00090/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-7.65, -38.86, 19.93, -43.47, -36.03, 0.0]\n  Target bbox: [619.03, 324.55, 661.22, 394.88]\n\nFrame 2:\n  Drone pose: [-6.78, -38.57, 20.0, -54.92, -36.9, 0.0]\n  Target bbox: [664.33, 292.67, 718.01, 372.38]\n\nFrame 3:\n  Drone pose: [-6.32, -38.74, 20.0, -46.03, -35.42, 0.0]\n  Target bbox: [601.14, 365.79, 651.44, 452.14]\n\nFrame 4:\n  Drone pose: [-5.82, -38.74, 20.0, -45.61, -31.61, 0.0]\n  Target bbox: [561.35, 377.78, 606.92, 457.47]\n\nFrame 5 (current):\n  Drone pose: [-5.55, -38.8, 20.0, -45.65, -30.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.13, \"ymin\": 372.11, \"xmax\": 607.51, \"ymax\": 453.35}, \"waypoint_deltas\": [{\"dx\": 0.27, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": -2.79, \"dyaw\": -4.36, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -2.53, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": 0.8, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -2.27, \"dyaw\": -3.12, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.26, \"dz\": 0.0, \"dpitch\": -2.01, \"dyaw\": -2.51, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": -1.75, \"dyaw\": -1.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.82, "window_alt_abs_m": 0.07, "target_px_mean_hist": 513.8, "cur_frame_id": 90, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00107/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.61, -39.3, 20.0, -45.65, -30.04, 0.0]\n  Target bbox: [623.19, 325.68, 656.69, 393.54]\n\nFrame 2:\n  Drone pose: [-1.18, -39.26, 20.0, -45.51, -30.04, 0.0]\n  Target bbox: [620.2, 322.0, 659.53, 397.37]\n\nFrame 3:\n  Drone pose: [-0.76, -39.22, 20.0, -50.38, -32.54, 0.0]\n  Target bbox: [649.89, 237.9, 690.13, 314.15]\n\nFrame 4:\n  Drone pose: [-0.34, -39.17, 20.0, -45.85, -32.76, 0.0]\n  Target bbox: [651.72, 311.11, 693.45, 388.89]\n\nFrame 5 (current):\n  Drone pose: [0.08, -39.13, 20.0, -42.17, -27.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 591.58, \"ymin\": 370.47, \"xmax\": 633.6, \"ymax\": 448.23}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -2.8, \"dyaw\": -2.26, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": -2.66, \"dyaw\": -2.25, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": -2.53, \"dyaw\": -2.25, \"droll\": 0.0}, {\"dx\": 1.69, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": -2.4, \"dyaw\": -2.25, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": -2.26, \"dyaw\": -2.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 596.5, "cur_frame_id": 107, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00120/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00124/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.57, -38.56, 20.0, -44.25, -35.0, 0.0]\n  Target bbox: [681.38, 310.72, 721.16, 383.54]\n\nFrame 2:\n  Drone pose: [6.07, -38.56, 20.0, -45.53, -29.77, 0.0]\n  Target bbox: [615.76, 284.46, 658.03, 363.26]\n\nFrame 3:\n  Drone pose: [6.57, -38.56, 20.0, -43.39, -30.0, 0.0]\n  Target bbox: [620.74, 322.59, 658.97, 396.93]\n\nFrame 4:\n  Drone pose: [7.07, -38.56, 20.0, -41.32, -25.0, 0.0]\n  Target bbox: [559.3, 360.19, 597.65, 432.51]\n\nFrame 5 (current):\n  Drone pose: [7.57, -38.56, 20.0, -42.8, -27.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 595.61, \"ymin\": 337.93, \"xmax\": 629.28, \"ymax\": 402.08}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": -2.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.23, "window_alt_abs_m": 0.0, "target_px_mean_hist": 570.2, "cur_frame_id": 124, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00137/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00138/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00141/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [14.07, -38.56, 20.01, -44.88, -25.0, 0.0]\n  Target bbox: [560.0, 299.89, 596.89, 373.71]\n\nFrame 2:\n  Drone pose: [14.41, -38.65, 20.03, -39.29, -30.76, 0.0]\n  Target bbox: [674.07, 319.76, 701.24, 385.67]\n\nFrame 3:\n  Drone pose: [15.06, -38.44, 20.12, -40.52, -28.36, 0.0]\n  Target bbox: [623.2, 323.8, 656.42, 395.87]\n\nFrame 4:\n  Drone pose: [15.57, -38.56, 20.01, -43.41, -30.0, 0.0]\n  Target bbox: [619.83, 321.87, 659.84, 397.72]\n\nFrame 5 (current):\n  Drone pose: [16.07, -38.56, 20.02, -43.42, -30.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.75, \"ymin\": 322.95, \"xmax\": 657.94, \"ymax\": 396.55}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.0, \"dz\": 0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": 0.0, \"dz\": 0.02, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.8, "window_alt_abs_m": 0.22, "target_px_mean_hist": 511.5, "cur_frame_id": 141, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00154/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00155/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00156/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00157/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448/aug_001/frames_playback/frame_00158/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [22.82, -39.08, 20.16, -44.32, -29.03, 0.0]\n  Target bbox: [618.42, 320.35, 661.19, 399.25]\n\nFrame 2:\n  Drone pose: [23.32, -39.37, 20.25, -43.46, -39.06, 0.0]\n  Target bbox: [660.99, 320.06, 692.78, 384.77]\n\nFrame 3:\n  Drone pose: [23.95, -39.34, 20.21, -44.75, -28.53, 0.0]\n  Target bbox: [618.3, 319.81, 661.29, 399.76]\n\nFrame 4:\n  Drone pose: [24.51, -39.48, 20.24, -45.84, -25.83, 0.0]\n  Target bbox: [595.15, 313.02, 626.39, 378.06]\n\nFrame 5 (current):\n  Drone pose: [25.04, -39.72, 20.14, -40.99, -25.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 564.55, \"ymin\": 314.76, \"xmax\": 586.78, \"ymax\": 376.08}, \"waypoint_deltas\": [{\"dx\": 0.59, \"dy\": -0.02, \"dz\": 0.18, \"dpitch\": -4.44, \"dyaw\": -2.59, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -0.15, \"dz\": 0.22, \"dpitch\": -4.68, \"dyaw\": -2.32, \"droll\": 0.0}, {\"dx\": 1.72, \"dy\": -0.28, \"dz\": 0.27, \"dpitch\": -4.93, \"dyaw\": -2.05, \"droll\": 0.0}, {\"dx\": 2.28, \"dy\": -0.41, \"dz\": 0.32, \"dpitch\": -5.18, \"dyaw\": -1.78, \"droll\": 0.0}, {\"dx\": 2.84, \"dy\": -0.54, \"dz\": 0.37, \"dpitch\": -5.44, \"dyaw\": -1.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 23.92, "window_alt_abs_m": 0.26, "target_px_mean_hist": 599.8, "cur_frame_id": 158, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776208448", "difficulty_score": 0.3025, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-19.66, 139.83, 22.0, -46.66, -116.57, 0.0]\n  Target bbox: [626.51, 337.6, 653.45, 382.1]\n\nFrame 2:\n  Drone pose: [-20.87, 138.25, 21.2, -47.79, -114.73, 0.0]\n  Target bbox: [629.71, 338.33, 650.22, 381.31]\n\nFrame 3:\n  Drone pose: [-21.54, 137.26, 20.67, -48.17, -113.41, 0.0]\n  Target bbox: [626.93, 336.37, 653.0, 383.24]\n\nFrame 4:\n  Drone pose: [-21.96, 136.57, 20.64, -48.68, -112.41, 0.0]\n  Target bbox: [627.93, 337.09, 652.14, 382.48]\n\nFrame 5 (current):\n  Drone pose: [-22.26, 135.99, 20.62, -48.64, -113.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.14, \"ymin\": 336.52, \"xmax\": 652.95, \"ymax\": 383.09}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": -0.56, \"dz\": -0.03, \"dpitch\": 0.13, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": -1.1, \"dz\": -0.05, \"dpitch\": 0.32, \"dyaw\": -1.87, \"droll\": 0.0}, {\"dx\": -0.55, \"dy\": -1.64, \"dz\": -0.07, \"dpitch\": 0.2, \"dyaw\": -1.52, \"droll\": 0.0}, {\"dx\": -0.64, \"dy\": -2.17, \"dz\": -0.09, \"dpitch\": 0.11, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": -2.71, \"dz\": -0.2, \"dpitch\": 0.15, \"dyaw\": -1.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.87, "window_alt_abs_m": 1.38, "target_px_mean_hist": 276.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-24.54, 128.74, 20.19, -48.64, -116.32, 0.0]\n  Target bbox: [628.03, 336.45, 652.06, 383.09]\n\nFrame 2:\n  Drone pose: [-24.9, 128.17, 20.17, -48.61, -116.84, 0.0]\n  Target bbox: [628.58, 336.33, 651.52, 383.24]\n\nFrame 3:\n  Drone pose: [-25.29, 127.62, 20.15, -48.59, -117.25, 0.0]\n  Target bbox: [627.33, 336.25, 652.77, 383.32]\n\nFrame 4:\n  Drone pose: [-25.71, 127.07, 20.14, -48.57, -117.57, 0.0]\n  Target bbox: [627.59, 335.18, 652.53, 384.4]\n\nFrame 5 (current):\n  Drone pose: [-26.16, 126.53, 20.12, -48.56, -117.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.11, \"ymin\": 337.11, \"xmax\": 651.97, \"ymax\": 382.45}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.53, \"dz\": -0.02, \"dpitch\": 0.0, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": -1.05, \"dz\": -0.03, \"dpitch\": 0.0, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": -1.56, \"dz\": -0.04, \"dpitch\": -0.01, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": -2.07, \"dz\": -0.05, \"dpitch\": -0.03, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -2.57, \"dz\": -0.06, \"dpitch\": -0.03, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.48, "window_alt_abs_m": 0.07, "target_px_mean_hist": 307.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-32.72, 119.52, 20.01, -49.78, -112.98, 0.0]\n  Target bbox: [626.36, 335.56, 653.56, 383.98]\n\nFrame 2:\n  Drone pose: [-32.98, 119.09, 20.01, -49.83, -112.01, 0.0]\n  Target bbox: [629.66, 336.57, 650.26, 382.96]\n\nFrame 3:\n  Drone pose: [-33.22, 118.67, 20.01, -49.86, -111.14, 0.0]\n  Target bbox: [629.66, 337.0, 650.26, 382.53]\n\nFrame 4:\n  Drone pose: [-33.44, 118.25, 20.01, -49.86, -110.32, 0.0]\n  Target bbox: [624.85, 335.18, 655.07, 384.32]\n\nFrame 5 (current):\n  Drone pose: [-33.65, 117.84, 20.01, -49.84, -109.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.12, \"ymin\": 335.17, \"xmax\": 654.8, \"ymax\": 384.3}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.42, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.77, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -0.84, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -1.25, \"dz\": -0.01, \"dpitch\": 0.08, \"dyaw\": 2.34, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": -1.67, \"dz\": -0.01, \"dpitch\": 0.1, \"dyaw\": 3.16, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -2.11, \"dz\": -0.01, \"dpitch\": 0.11, \"dyaw\": 3.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.44, "window_alt_abs_m": 0.01, "target_px_mean_hist": 314.8, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.98, 112.69, 20.0, -48.28, -88.76, 0.0]\n  Target bbox: [626.01, 334.71, 654.0, 384.89]\n\nFrame 2:\n  Drone pose: [-41.61, 112.47, 20.0, -47.73, -85.08, 0.0]\n  Target bbox: [626.95, 337.61, 653.19, 381.9]\n\nFrame 3:\n  Drone pose: [-43.0, 112.19, 20.0, -47.1, -80.73, 0.0]\n  Target bbox: [625.19, 334.46, 654.76, 385.22]\n\nFrame 4:\n  Drone pose: [-44.06, 111.85, 20.0, -46.69, -79.08, 0.0]\n  Target bbox: [628.43, 336.61, 651.52, 383.03]\n\nFrame 5 (current):\n  Drone pose: [-44.77, 111.47, 20.0, -46.45, -78.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.48, \"ymin\": 336.93, \"xmax\": 653.63, \"ymax\": 382.64}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": -0.63, \"dy\": -0.91, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": 0.39, \"dyaw\": 2.18, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": 2.41, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": 2.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.25, "window_alt_abs_m": 0.0, "target_px_mean_hist": 311.8, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.78, 104.91, 20.0, -46.18, -75.41, 0.0]\n  Target bbox: [625.69, 337.23, 654.42, 382.42]\n\nFrame 2:\n  Drone pose: [-45.77, 104.44, 20.0, -46.14, -75.49, 0.0]\n  Target bbox: [631.03, 338.03, 649.06, 381.6]\n\nFrame 3:\n  Drone pose: [-45.74, 103.98, 20.0, -46.09, -75.6, 0.0]\n  Target bbox: [626.69, 336.71, 653.41, 382.87]\n\nFrame 4:\n  Drone pose: [-45.71, 103.52, 20.0, -46.04, -75.72, 0.0]\n  Target bbox: [630.57, 337.6, 649.52, 382.02]\n\nFrame 5 (current):\n  Drone pose: [-45.68, 103.06, 20.0, -46.0, -75.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.16, \"ymin\": 336.71, \"xmax\": 653.94, \"ymax\": 382.91}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.27, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -2.42, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.43, "window_alt_abs_m": 0.0, "target_px_mean_hist": 295.0, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.6, 96.11, 20.0, -45.95, -76.1, 0.0]\n  Target bbox: [625.95, 337.46, 654.15, 382.19]\n\nFrame 2:\n  Drone pose: [-45.63, 95.59, 20.0, -45.97, -76.0, 0.0]\n  Target bbox: [627.1, 337.23, 653.01, 382.41]\n\nFrame 3:\n  Drone pose: [-45.67, 95.07, 20.0, -46.0, -75.88, 0.0]\n  Target bbox: [627.29, 337.24, 652.8, 382.36]\n\nFrame 4:\n  Drone pose: [-45.71, 94.53, 20.0, -46.03, -75.72, 0.0]\n  Target bbox: [627.22, 336.94, 652.88, 382.68]\n\nFrame 5 (current):\n  Drone pose: [-45.77, 93.99, 20.0, -46.07, -75.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.3, \"ymin\": 337.3, \"xmax\": 650.8, \"ymax\": 382.33}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.6, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": -2.12, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -2.63, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.57, "window_alt_abs_m": 0.0, "target_px_mean_hist": 292.2, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.93, 86.85, 20.0, -46.22, -74.92, 0.0]\n  Target bbox: [626.32, 336.53, 653.78, 383.05]\n\nFrame 2:\n  Drone pose: [-45.93, 86.35, 20.0, -46.22, -74.93, 0.0]\n  Target bbox: [628.82, 337.29, 651.27, 382.31]\n\nFrame 3:\n  Drone pose: [-45.93, 85.85, 20.0, -46.22, -74.94, 0.0]\n  Target bbox: [630.24, 337.77, 649.86, 381.86]\n\nFrame 4:\n  Drone pose: [-45.92, 85.35, 20.0, -46.22, -74.95, 0.0]\n  Target bbox: [630.32, 337.47, 649.77, 382.16]\n\nFrame 5 (current):\n  Drone pose: [-45.92, 84.85, 20.0, -46.22, -74.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.96, \"ymin\": 338.03, \"xmax\": 649.13, \"ymax\": 381.6}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.02, "window_alt_abs_m": 0.0, "target_px_mean_hist": 292.8, "cur_frame_id": 110, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.9, 78.34, 20.0, -46.25, -75.02, 0.0]\n  Target bbox: [628.2, 337.86, 651.89, 381.78]\n\nFrame 2:\n  Drone pose: [-45.88, 77.84, 20.0, -46.25, -75.06, 0.0]\n  Target bbox: [626.1, 336.71, 654.0, 382.9]\n\nFrame 3:\n  Drone pose: [-45.87, 77.34, 20.0, -46.25, -75.11, 0.0]\n  Target bbox: [625.65, 337.24, 654.45, 382.41]\n\nFrame 4:\n  Drone pose: [-45.85, 76.85, 20.0, -46.25, -75.16, 0.0]\n  Target bbox: [626.3, 336.53, 653.8, 383.04]\n\nFrame 5 (current):\n  Drone pose: [-45.84, 76.35, 20.0, -46.25, -75.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.63, \"ymin\": 337.25, \"xmax\": 654.47, \"ymax\": 382.4}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.98, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -2.48, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.19, "window_alt_abs_m": 0.0, "target_px_mean_hist": 294.0, "cur_frame_id": 127, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00145/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.73, 69.43, 20.0, -46.18, -75.58, 0.0]\n  Target bbox: [627.78, 337.38, 652.31, 382.23]\n\nFrame 2:\n  Drone pose: [-45.69, 68.97, 20.0, -46.14, -75.74, 0.0]\n  Target bbox: [628.5, 337.29, 651.59, 382.31]\n\nFrame 3:\n  Drone pose: [-45.64, 68.5, 20.0, -46.1, -75.91, 0.0]\n  Target bbox: [627.78, 336.91, 652.34, 382.72]\n\nFrame 4:\n  Drone pose: [-45.6, 68.04, 20.0, -46.06, -76.07, 0.0]\n  Target bbox: [631.15, 337.73, 648.95, 381.9]\n\nFrame 5 (current):\n  Drone pose: [-45.56, 67.57, 20.0, -46.03, -76.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.5, \"ymin\": 336.82, \"xmax\": 653.61, \"ymax\": 382.79}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.44, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -2.42, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.63, "window_alt_abs_m": 0.0, "target_px_mean_hist": 291.2, "cur_frame_id": 145, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/ORI/frames_playback/frame_00163/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.49, 60.65, 20.0, -45.93, -76.48, 0.0]\n  Target bbox: [629.6, 337.46, 650.5, 382.16]\n\nFrame 2:\n  Drone pose: [-45.5, 60.16, 20.0, -45.92, -76.44, 0.0]\n  Target bbox: [629.21, 337.78, 650.89, 381.87]\n\nFrame 3:\n  Drone pose: [-45.52, 59.67, 20.0, -45.9, -76.39, 0.0]\n  Target bbox: [626.3, 336.73, 653.82, 382.9]\n\nFrame 4:\n  Drone pose: [-45.54, 59.17, 20.0, -45.89, -76.34, 0.0]\n  Target bbox: [627.12, 336.99, 652.98, 382.6]\n\nFrame 5 (current):\n  Drone pose: [-45.55, 58.67, 20.0, -45.88, -76.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.7, \"ymin\": 336.78, \"xmax\": 653.42, \"ymax\": 382.85}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -2.03, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": -2.58, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.18, "window_alt_abs_m": 0.0, "target_px_mean_hist": 293.8, "cur_frame_id": 163, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-19.66, 139.83, 22.0, -46.66, -116.57, 0.0]\n  Target bbox: [627.04, 337.49, 652.93, 382.18]\n\nFrame 2:\n  Drone pose: [-20.87, 138.25, 21.2, -46.0, -118.59, 0.0]\n  Target bbox: [668.84, 367.46, 698.35, 414.45]\n\nFrame 3:\n  Drone pose: [-21.57, 137.24, 20.72, -44.59, -112.81, 0.0]\n  Target bbox: [619.12, 397.74, 648.58, 446.0]\n\nFrame 4:\n  Drone pose: [-21.96, 136.57, 20.64, -53.07, -107.47, 0.0]\n  Target bbox: [570.16, 262.39, 599.77, 313.39]\n\nFrame 5 (current):\n  Drone pose: [-22.24, 136.01, 20.68, -47.96, -114.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 640.63, \"ymin\": 349.96, \"xmax\": 662.03, \"ymax\": 394.28}, \"waypoint_deltas\": [{\"dx\": -0.25, \"dy\": -0.58, \"dz\": -0.09, \"dpitch\": -0.55, \"dyaw\": 0.14, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -1.12, \"dz\": -0.11, \"dpitch\": -0.36, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -1.66, \"dz\": -0.13, \"dpitch\": -0.48, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -0.66, \"dy\": -2.19, \"dz\": -0.15, \"dpitch\": -0.57, \"dyaw\": -0.26, \"droll\": 0.0}, {\"dx\": -0.74, \"dy\": -2.73, \"dz\": -0.26, \"dpitch\": -0.53, \"dyaw\": -0.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.81, "window_alt_abs_m": 1.4, "target_px_mean_hist": 270.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": 
"unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-24.65, 128.75, 20.22, -46.85, -113.09, 0.0]\n  Target bbox: [594.58, 367.84, 621.3, 416.54]\n\nFrame 2:\n  Drone pose: [-24.84, 128.24, 20.31, -48.67, -116.92, 0.0]\n  Target bbox: [626.51, 335.64, 653.62, 383.98]\n\nFrame 3:\n  Drone pose: [-25.36, 127.56, 20.31, -48.95, -117.12, 0.0]\n  Target bbox: [629.23, 338.06, 650.84, 381.49]\n\nFrame 4:\n  Drone pose: [-25.75, 127.18, 20.26, -48.61, -117.27, 0.0]\n  Target bbox: [629.18, 338.05, 650.88, 381.51]\n\nFrame 5 (current):\n  Drone pose: [-26.16, 126.53, 20.12, -45.92, -121.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 665.07, \"ymin\": 383.39, \"xmax\": 687.33, \"ymax\": 426.57}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.53, \"dz\": -0.02, \"dpitch\": -2.64, \"dyaw\": 3.1, \"droll\": 0.0}, {\"dx\": -0.96, \"dy\": -1.05, \"dz\": -0.03, \"dpitch\": -2.64, \"dyaw\": 3.05, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": -1.56, \"dz\": -0.04, \"dpitch\": -2.65, \"dyaw\": 3.06, \"droll\": 0.0}, {\"dx\": -1.99, \"dy\": -2.07, \"dz\": -0.05, \"dpitch\": -2.67, \"dyaw\": 3.13, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -2.57, \"dz\": -0.06, \"dpitch\": -2.67, \"dyaw\": 3.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.95, "window_alt_abs_m": 0.28, "target_px_mean_hist": 308.0, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": 
"unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-32.91, 119.53, 20.01, -51.82, -110.06, 0.0]\n  Target bbox: [601.47, 303.69, 628.68, 351.6]\n\nFrame 2:\n  Drone pose: [-32.95, 119.18, 20.1, -49.81, -112.0, 0.0]\n  Target bbox: [624.53, 334.95, 655.4, 384.58]\n\nFrame 3:\n  Drone pose: [-33.09, 118.59, 19.95, -49.81, -111.66, 0.0]\n  Target bbox: [624.75, 334.83, 655.17, 384.7]\n\nFrame 4:\n  Drone pose: [-33.28, 118.18, 20.02, -49.73, -115.95, 0.0]\n  Target bbox: [678.1, 339.34, 710.4, 389.23]\n\nFrame 5 (current):\n  Drone pose: [-33.65, 117.84, 20.01, -49.84, -109.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.52, \"ymin\": 336.23, \"xmax\": 651.39, \"ymax\": 383.3}, \"waypoint_deltas\": [{\"dx\": -0.2, \"dy\": -0.42, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.77, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -0.84, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": -0.62, \"dy\": -1.25, \"dz\": -0.01, \"dpitch\": 0.08, \"dyaw\": 2.34, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": -1.67, \"dz\": -0.01, \"dpitch\": 0.1, \"dyaw\": 3.16, \"droll\": 0.0}, {\"dx\": -1.06, \"dy\": -2.11, \"dz\": -0.01, \"dpitch\": 0.11, \"dyaw\": 3.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.0, "window_alt_abs_m": 0.33, "target_px_mean_hist": 317.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-40.0, 112.61, 20.09, -48.56, -88.68, 0.0]\n  Target bbox: [628.2, 336.74, 651.78, 382.85]\n\nFrame 2:\n  Drone pose: [-41.61, 112.47, 20.0, -45.14, -80.08, 0.0]\n  Target bbox: [574.02, 382.82, 592.9, 427.34]\n\nFrame 3:\n  Drone pose: [-42.88, 112.18, 19.98, -47.12, -81.11, 0.0]\n  Target bbox: [625.18, 334.06, 654.78, 385.59]\n\nFrame 4:\n  Drone pose: [-44.06, 111.85, 20.0, -44.24, -77.29, 0.0]\n  Target bbox: [605.27, 375.93, 633.14, 426.45]\n\nFrame 5 (current):\n  Drone pose: [-44.77, 111.47, 20.0, -45.26, -73.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 567.36, \"ymin\": 358.12, \"xmax\": 596.73, \"ymax\": 404.93}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": -0.95, \"dyaw\": -3.79, \"droll\": 0.0}, {\"dx\": -0.63, \"dy\": -0.91, \"dz\": 0.0, \"dpitch\": -0.84, \"dyaw\": -3.15, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": -0.8, \"dyaw\": -2.82, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": -0.81, \"dyaw\": -2.59, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": -0.84, \"dyaw\": -2.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.23, "window_alt_abs_m": 0.12, "target_px_mean_hist": 304.2, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.94, 104.97, 20.06, -44.98, -75.17, 0.0]\n  Target bbox: [630.9, 356.39, 653.15, 401.53]\n\nFrame 2:\n  Drone pose: [-45.75, 104.3, 19.95, -45.0, -74.44, 0.0]\n  Target bbox: [616.59, 359.02, 640.46, 403.98]\n\nFrame 3:\n  Drone pose: [-45.68, 104.1, 20.01, -45.94, -75.86, 0.0]\n  Target bbox: [626.34, 336.85, 653.76, 382.76]\n\nFrame 4:\n  Drone pose: [-45.72, 103.67, 20.1, -43.34, -80.8, 0.0]\n  Target bbox: [684.72, 383.88, 712.5, 427.6]\n\nFrame 5 (current):\n  Drone pose: [-45.82, 103.13, 19.94, -49.68, -74.82, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.87, \"ymin\": 270.3, \"xmax\": 646.16, \"ymax\": 317.3}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.54, \"dz\": 0.06, \"dpitch\": 3.71, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -1.02, \"dz\": 0.06, \"dpitch\": 3.74, \"dyaw\": -1.22, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -1.5, \"dz\": 0.06, \"dpitch\": 3.75, \"dyaw\": -1.29, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -2.0, \"dz\": 0.06, \"dpitch\": 3.75, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": -2.49, \"dz\": 0.06, \"dpitch\": 3.76, \"dyaw\": -1.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.08, "window_alt_abs_m": 0.41, "target_px_mean_hist": 288.2, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.6, 96.11, 20.0, -50.11, -75.96, 0.0]\n  Target bbox: [625.37, 267.65, 651.43, 312.18]\n\nFrame 2:\n  Drone pose: [-45.63, 95.59, 20.0, -47.03, -80.45, 0.0]\n  Target bbox: [678.03, 321.15, 706.25, 365.62]\n\nFrame 3:\n  Drone pose: [-45.61, 94.94, 19.91, -46.98, -78.11, 0.0]\n  Target bbox: [655.9, 322.88, 674.36, 367.22]\n\nFrame 4:\n  Drone pose: [-45.71, 94.53, 20.0, -50.29, -74.13, 0.0]\n  Target bbox: [606.62, 265.68, 635.83, 311.24]\n\nFrame 5 (current):\n  Drone pose: [-45.76, 94.02, 19.9, -45.89, -75.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.41, \"ymin\": 337.29, \"xmax\": 651.68, \"ymax\": 382.32}, \"waypoint_deltas\": [{\"dx\": -0.06, \"dy\": -0.57, \"dz\": 0.1, \"dpitch\": -0.22, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -1.1, \"dz\": 0.1, \"dpitch\": -0.25, \"dyaw\": 0.38, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": -1.63, \"dz\": 0.1, \"dpitch\": -0.28, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -2.15, \"dz\": 0.1, \"dpitch\": -0.29, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": -0.18, \"dy\": -2.66, \"dz\": 0.1, \"dpitch\": -0.3, \"dyaw\": 0.64, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.25, "window_alt_abs_m": 0.27, "target_px_mean_hist": 298.8, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.9, 86.83, 19.81, -46.47, -75.93, 0.0]\n  Target bbox: [639.42, 329.42, 662.34, 373.38]\n\nFrame 2:\n  Drone pose: [-45.93, 86.35, 20.0, -46.22, -74.93, 0.0]\n  Target bbox: [625.98, 336.59, 654.13, 383.02]\n\nFrame 3:\n  Drone pose: [-45.93, 85.85, 20.0, -46.22, -74.94, 0.0]\n  Target bbox: [627.56, 336.98, 652.53, 382.61]\n\nFrame 4:\n  Drone pose: [-45.97, 85.4, 20.07, -46.66, -75.45, 0.0]\n  Target bbox: [633.31, 329.69, 661.19, 375.85]\n\nFrame 5 (current):\n  Drone pose: [-45.92, 84.85, 20.0, -46.22, -74.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.51, \"ymin\": 336.52, \"xmax\": 653.59, \"ymax\": 383.05}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.02, "window_alt_abs_m": 0.34, "target_px_mean_hist": 292.2, "cur_frame_id": 110, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00127/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.9, 78.34, 20.0, -46.25, -75.02, 0.0]\n  Target bbox: [630.26, 337.8, 649.83, 381.83]\n\nFrame 2:\n  Drone pose: [-45.88, 77.94, 19.86, -45.89, -75.14, 0.0]\n  Target bbox: [626.18, 336.65, 653.92, 382.96]\n\nFrame 3:\n  Drone pose: [-45.84, 77.45, 20.09, -46.23, -75.27, 0.0]\n  Target bbox: [629.16, 337.32, 650.94, 382.32]\n\nFrame 4:\n  Drone pose: [-45.85, 76.85, 20.0, -43.79, -72.53, 0.0]\n  Target bbox: [595.09, 378.0, 623.38, 425.22]\n\nFrame 5 (current):\n  Drone pose: [-45.71, 76.49, 20.0, -46.1, -75.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.42, \"ymin\": 337.58, \"xmax\": 651.68, \"ymax\": 382.06}, \"waypoint_deltas\": [{\"dx\": -0.12, \"dy\": -0.63, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -1.13, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -1.63, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -2.12, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -2.62, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 0.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.17, "window_alt_abs_m": 0.46, "target_px_mean_hist": 301.5, "cur_frame_id": 127, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00145/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.84, 69.35, 20.13, -46.59, -75.77, 0.0]\n  Target bbox: [634.82, 334.85, 658.32, 379.57]\n\nFrame 2:\n  Drone pose: [-45.69, 68.97, 20.0, -43.8, -77.33, 0.0]\n  Target bbox: [646.85, 376.82, 670.46, 421.69]\n\nFrame 3:\n  Drone pose: [-45.62, 68.37, 20.1, -46.45, -75.86, 0.0]\n  Target bbox: [626.29, 336.95, 653.81, 382.66]\n\nFrame 4:\n  Drone pose: [-45.57, 67.98, 20.02, -48.43, -71.81, 0.0]\n  Target bbox: [576.09, 300.05, 603.87, 347.09]\n\nFrame 5 (current):\n  Drone pose: [-45.56, 67.57, 20.0, -42.29, -81.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 686.3, \"ymin\": 402.29, \"xmax\": 710.86, \"ymax\": 446.23}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": -3.71, \"dyaw\": 4.88, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": -3.69, \"dyaw\": 4.8, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.44, \"dz\": 0.0, \"dpitch\": -3.67, \"dyaw\": 4.74, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.93, \"dz\": 0.0, \"dpitch\": -3.66, \"dyaw\": 4.71, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -2.42, \"dz\": 0.0, \"dpitch\": -3.66, \"dyaw\": 4.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.49, "window_alt_abs_m": 0.33, "target_px_mean_hist": 289.8, "cur_frame_id": 145, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477/aug_001/frames_playback/frame_00163/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-45.45, 60.52, 20.0, -50.54, -74.48, 0.0]\n  Target bbox: [607.0, 263.97, 625.56, 308.5]\n\nFrame 2:\n  Drone pose: [-45.46, 60.05, 20.07, -47.9, -77.54, 0.0]\n  Target bbox: [638.45, 308.28, 666.39, 354.43]\n\nFrame 3:\n  Drone pose: [-45.44, 59.57, 19.98, -50.23, -81.57, 0.0]\n  Target bbox: [687.38, 269.54, 709.78, 312.84]\n\nFrame 4:\n  Drone pose: [-45.6, 59.32, 19.92, -46.38, -74.8, 0.0]\n  Target bbox: [609.89, 323.36, 635.61, 367.97]\n\nFrame 5 (current):\n  Drone pose: [-45.43, 58.69, 20.1, -48.67, -79.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 663.34, \"ymin\": 294.85, \"xmax\": 689.98, \"ymax\": 338.23}, \"waypoint_deltas\": [{\"dx\": -0.13, \"dy\": -0.52, \"dz\": -0.1, \"dpitch\": 2.8, \"dyaw\": 3.53, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": -1.02, \"dz\": -0.1, \"dpitch\": 2.8, \"dyaw\": 3.56, \"droll\": 0.0}, {\"dx\": -0.15, \"dy\": -1.52, \"dz\": -0.1, \"dpitch\": 2.79, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -2.05, \"dz\": -0.1, \"dpitch\": 2.76, \"dyaw\": 3.67, \"droll\": 0.0}, {\"dx\": -0.21, \"dy\": -2.6, \"dz\": -0.1, \"dpitch\": 2.7, \"dyaw\": 3.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.86, "window_alt_abs_m": 0.41, "target_px_mean_hist": 293.8, "cur_frame_id": 163, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-17/trajectory_1776426477", "difficulty_score": 0.4387, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.39, 145.94, 22.0, -46.48, -90.0, 0.0]\n  Target bbox: [623.56, 332.03, 656.44, 387.13]\n\nFrame 2:\n  Drone pose: [97.79, 144.49, 21.2, -46.76, -88.18, 0.0]\n  Target bbox: [628.87, 330.48, 651.3, 388.65]\n\nFrame 3:\n  Drone pose: [97.77, 143.85, 20.67, -46.21, -88.12, 0.0]\n  Target bbox: [626.29, 330.44, 653.92, 388.7]\n\nFrame 4:\n  Drone pose: [97.67, 143.08, 20.64, -46.58, -87.79, 0.0]\n  Target bbox: [621.71, 329.53, 658.55, 389.51]\n\nFrame 5 (current):\n  Drone pose: [97.66, 142.45, 20.62, -46.73, -87.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.74, \"ymin\": 329.72, \"xmax\": 658.53, \"ymax\": 389.35}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.63, \"dz\": -0.03, \"dpitch\": -0.17, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -1.24, \"dz\": -0.05, \"dpitch\": -0.3, \"dyaw\": 0.36, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -1.82, \"dz\": -0.07, \"dpitch\": -0.4, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": -2.46, \"dz\": -0.09, \"dpitch\": -0.57, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": -3.02, \"dz\": -0.2, \"dpitch\": -0.51, \"dyaw\": 1.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.28, "window_alt_abs_m": 1.38, "target_px_mean_hist": 479.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.12, 135.31, 20.19, -47.07, -85.93, 0.0]\n  Target bbox: [624.02, 328.47, 656.21, 390.52]\n\nFrame 2:\n  Drone pose: [97.11, 134.81, 20.17, -47.04, -85.88, 0.0]\n  Target bbox: [618.14, 327.49, 662.19, 391.51]\n\nFrame 3:\n  Drone pose: [97.09, 134.31, 20.15, -47.01, -85.84, 0.0]\n  Target bbox: [620.1, 327.94, 660.18, 391.03]\n\nFrame 4:\n  Drone pose: [97.08, 133.81, 20.13, -46.98, -85.79, 0.0]\n  Target bbox: [627.6, 328.43, 652.6, 390.59]\n\nFrame 5 (current):\n  Drone pose: [97.06, 133.31, 20.12, -46.95, -85.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.81, \"ymin\": 328.24, \"xmax\": 660.47, \"ymax\": 390.77}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.5, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.0, \"dz\": -0.03, \"dpitch\": 0.05, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": -1.49, \"dz\": -0.04, \"dpitch\": 0.07, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -0.05, \"dy\": -1.99, \"dz\": -0.05, \"dpitch\": 0.09, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": -2.49, \"dz\": -0.06, \"dpitch\": 0.1, \"dyaw\": 0.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.19, "window_alt_abs_m": 0.07, "target_px_mean_hist": 530.2, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [96.93, 126.83, 20.02, -46.76, -85.31, 0.0]\n  Target bbox: [618.77, 327.02, 661.55, 391.96]\n\nFrame 2:\n  Drone pose: [96.92, 126.33, 20.01, -46.75, -85.29, 0.0]\n  Target bbox: [618.82, 327.02, 661.49, 391.97]\n\nFrame 3:\n  Drone pose: [96.91, 125.83, 20.01, -46.75, -85.26, 0.0]\n  Target bbox: [622.14, 328.14, 658.11, 390.84]\n\nFrame 4:\n  Drone pose: [96.9, 125.33, 20.01, -46.74, -85.24, 0.0]\n  Target bbox: [619.98, 327.67, 660.3, 391.3]\n\nFrame 5 (current):\n  Drone pose: [96.9, 124.83, 20.01, -46.74, -85.23, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.28, \"ymin\": 328.8, \"xmax\": 653.93, \"ymax\": 390.26}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.5, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -2.51, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": -0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.08, "window_alt_abs_m": 0.01, "target_px_mean_hist": 541.0, "cur_frame_id": 38, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.82, 118.21, 20.0, -47.01, -88.15, 0.0]\n  Target bbox: [620.2, 327.75, 660.12, 391.19]\n\nFrame 2:\n  Drone pose: [98.02, 117.67, 20.0, -47.07, -88.8, 0.0]\n  Target bbox: [620.98, 328.83, 659.32, 390.15]\n\nFrame 3:\n  Drone pose: [98.24, 117.13, 20.0, -47.15, -89.51, 0.0]\n  Target bbox: [620.59, 328.61, 659.6, 390.32]\n\nFrame 4:\n  Drone pose: [98.47, 116.58, 20.0, -47.23, -90.26, 0.0]\n  Target bbox: [619.36, 327.96, 660.53, 390.96]\n\nFrame 5 (current):\n  Drone pose: [98.71, 116.03, 20.0, -47.31, -91.02, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.06, \"ymin\": 328.61, \"xmax\": 661.59, \"ymax\": 390.37}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -1.39, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -1.9, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": -2.23, \"dz\": 0.0, \"dpitch\": -0.32, \"dyaw\": -2.21, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -2.79, \"dz\": 0.0, \"dpitch\": -0.42, \"dyaw\": -2.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 515.5, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.31, 108.9, 20.0, -47.71, -99.76, 0.0]\n  Target bbox: [619.31, 321.64, 660.81, 397.45]\n\nFrame 2:\n  Drone pose: [96.96, 108.38, 20.0, -47.69, -100.27, 0.0]\n  Target bbox: [620.25, 323.21, 659.82, 395.88]\n\nFrame 3:\n  Drone pose: [96.65, 107.88, 20.0, -47.64, -100.87, 0.0]\n  Target bbox: [620.79, 324.52, 659.22, 394.58]\n\nFrame 4:\n  Drone pose: [96.37, 107.39, 20.0, -47.55, -101.61, 0.0]\n  Target bbox: [616.72, 324.85, 663.02, 394.12]\n\nFrame 5 (current):\n  Drone pose: [96.14, 106.9, 20.0, -47.6, -100.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.31, \"ymin\": 325.77, \"xmax\": 661.46, \"ymax\": 393.17}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.65, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": -0.51, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.72, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -1.88, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 2.2, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 2.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.61, "window_alt_abs_m": 0.0, "target_px_mean_hist": 543.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.97, 99.32, 20.0, -47.37, -93.55, 0.0]\n  Target bbox: [621.31, 328.33, 658.42, 390.64]\n\nFrame 2:\n  Drone pose: [93.87, 98.84, 20.0, -47.35, -93.22, 0.0]\n  Target bbox: [617.28, 327.24, 662.37, 391.71]\n\nFrame 3:\n  Drone pose: [93.78, 98.35, 20.0, -47.33, -92.9, 0.0]\n  Target bbox: [623.98, 327.81, 655.76, 391.16]\n\nFrame 4:\n  Drone pose: [93.68, 97.87, 20.0, -47.31, -92.6, 0.0]\n  Target bbox: [624.55, 328.62, 655.21, 390.37]\n\nFrame 5 (current):\n  Drone pose: [93.59, 97.38, 20.0, -47.29, -92.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.04, \"ymin\": 328.83, \"xmax\": 653.74, \"ymax\": 390.16}, \"waypoint_deltas\": [{\"dx\": -0.09, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": -0.25, \"dy\": -1.45, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.83, \"droll\": 0.0}, {\"dx\": -0.32, \"dy\": -1.94, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": -0.4, \"dy\": -2.43, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 1.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.25, "window_alt_abs_m": 0.0, "target_px_mean_hist": 567.2, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.84, 91.03, 20.0, -47.31, -89.82, 0.0]\n  Target bbox: [618.27, 328.19, 661.81, 390.75]\n\nFrame 2:\n  Drone pose: [92.83, 90.54, 20.0, -47.3, -89.8, 0.0]\n  Target bbox: [628.14, 328.67, 651.95, 390.29]\n\nFrame 3:\n  Drone pose: [92.84, 90.04, 20.0, -47.28, -89.82, 0.0]\n  Target bbox: [620.4, 328.54, 659.67, 390.36]\n\nFrame 4:\n  Drone pose: [92.85, 89.55, 20.0, -47.27, -89.86, 0.0]\n  Target bbox: [627.04, 329.42, 653.02, 389.59]\n\nFrame 5 (current):\n  Drone pose: [92.87, 89.05, 20.0, -47.27, -89.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.59, \"ymin\": 328.06, \"xmax\": 661.43, \"ymax\": 390.87}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.48, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": -1.98, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -2.48, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.16, "window_alt_abs_m": 0.0, "target_px_mean_hist": 542.5, "cur_frame_id": 109, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00126/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.51, 82.55, 20.0, -47.25, -92.02, 0.0]\n  Target bbox: [617.62, 321.08, 662.37, 398.15]\n\nFrame 2:\n  Drone pose: [92.36, 82.05, 20.0, -47.24, -93.14, 0.0]\n  Target bbox: [623.02, 328.3, 656.74, 390.64]\n\nFrame 3:\n  Drone pose: [92.22, 81.54, 20.0, -47.26, -92.69, 0.0]\n  Target bbox: [620.94, 328.34, 658.78, 390.61]\n\nFrame 4:\n  Drone pose: [92.11, 81.04, 20.0, -47.26, -92.32, 0.0]\n  Target bbox: [625.43, 329.03, 654.34, 389.98]\n\nFrame 5 (current):\n  Drone pose: [92.02, 80.55, 20.0, -47.26, -92.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.97, \"ymin\": 327.51, \"xmax\": 660.69, \"ymax\": 391.42}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 540.8, "cur_frame_id": 126, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00143/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.72, 74.25, 20.0, -46.64, -94.26, 0.0]\n  Target bbox: [623.41, 328.47, 656.34, 390.56]\n\nFrame 2:\n  Drone pose: [92.78, 73.72, 20.0, -46.69, -94.47, 0.0]\n  Target bbox: [625.62, 328.65, 654.16, 390.39]\n\nFrame 3:\n  Drone pose: [92.82, 73.17, 20.0, -46.76, -94.62, 0.0]\n  Target bbox: [619.91, 327.68, 659.81, 391.29]\n\nFrame 4:\n  Drone pose: [92.84, 72.62, 20.0, -46.83, -94.69, 0.0]\n  Target bbox: [619.51, 327.74, 660.21, 391.26]\n\nFrame 5 (current):\n  Drone pose: [92.84, 72.07, 20.0, -46.91, -94.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.99, \"ymin\": 327.69, \"xmax\": 658.75, \"ymax\": 391.27}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -1.06, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.57, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.08, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.05, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -2.54, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 545.5, "cur_frame_id": 143, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00156/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00157/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/ORI/frames_playback/frame_00160/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [94.21, 65.52, 20.0, -46.6, -100.69, 0.0]\n  Target bbox: [621.84, 323.92, 658.26, 395.21]\n\nFrame 2:\n  Drone pose: [94.37, 65.02, 20.0, -46.39, -102.75, 0.0]\n  Target bbox: [618.66, 325.95, 661.13, 393.1]\n\nFrame 3:\n  Drone pose: [94.52, 64.51, 20.0, -46.33, -103.23, 0.0]\n  Target bbox: [620.88, 324.65, 659.17, 394.53]\n\nFrame 4:\n  Drone pose: [94.68, 64.01, 20.0, -46.08, -105.24, 0.0]\n  Target bbox: [622.47, 326.33, 657.33, 392.79]\n\nFrame 5 (current):\n  Drone pose: [94.84, 63.51, 20.0, -46.02, -105.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.95, \"ymin\": 326.45, \"xmax\": 656.88, \"ymax\": 392.63}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.94, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -1.41, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -1.88, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.35, \"dyaw\": -2.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 532.8, "cur_frame_id": 160, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.45, 145.98, 21.86, -44.99, -92.02, 0.0]\n  Target bbox: [646.18, 353.09, 677.4, 408.74]\n\nFrame 2:\n  Drone pose: [97.79, 144.49, 21.2, -46.76, -88.18, 0.0]\n  Target bbox: [621.88, 330.6, 658.39, 388.52]\n\nFrame 3:\n  Drone pose: [97.77, 143.85, 20.67, -42.6, -87.23, 0.0]\n  Target bbox: [616.57, 390.89, 642.34, 449.63]\n\nFrame 4:\n  Drone pose: [97.57, 143.09, 20.67, -45.39, -84.72, 0.0]\n  Target bbox: [595.51, 350.14, 620.61, 410.79]\n\nFrame 5 (current):\n  Drone pose: [97.78, 142.52, 20.74, -48.36, -83.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 561.44, \"ymin\": 304.53, \"xmax\": 603.04, \"ymax\": 366.42}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": -0.7, \"dz\": -0.15, \"dpitch\": 1.46, \"dyaw\": -4.42, \"droll\": 0.0}, {\"dx\": -0.23, \"dy\": -1.31, \"dz\": -0.17, \"dpitch\": 1.33, \"dyaw\": -4.23, \"droll\": 0.0}, {\"dx\": -0.29, \"dy\": -1.89, \"dz\": -0.19, \"dpitch\": 1.23, \"dyaw\": -4.02, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": -2.53, \"dz\": -0.21, \"dpitch\": 1.06, \"dyaw\": -3.78, \"droll\": 0.0}, {\"dx\": -0.43, \"dy\": -3.09, \"dz\": -0.32, \"dpitch\": 1.12, \"dyaw\": -3.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.9, "window_alt_abs_m": 1.26, "target_px_mean_hist": 480.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.21, 135.24, 20.25, -43.89, -91.19, 0.0]\n  Target bbox: [680.69, 387.54, 714.35, 448.67]\n\nFrame 2:\n  Drone pose: [97.11, 134.81, 20.17, -51.25, -81.74, 0.0]\n  Target bbox: [571.35, 257.65, 612.89, 322.45]\n\nFrame 3:\n  Drone pose: [97.14, 134.48, 20.2, -47.47, -91.02, 0.0]\n  Target bbox: [676.85, 319.84, 719.27, 380.74]\n\nFrame 4:\n  Drone pose: [97.04, 133.75, 19.96, -46.81, -85.66, 0.0]\n  Target bbox: [626.17, 328.78, 654.04, 390.27]\n\nFrame 5 (current):\n  Drone pose: [97.0, 133.15, 20.09, -51.49, -84.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 605.99, \"ymin\": 254.98, \"xmax\": 645.2, \"ymax\": 318.84}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.34, \"dz\": 0.01, \"dpitch\": 4.56, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -0.84, \"dz\": 0.0, \"dpitch\": 4.59, \"dyaw\": -1.41, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.33, \"dz\": -0.01, \"dpitch\": 4.61, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.83, \"dz\": -0.02, \"dpitch\": 4.63, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.33, \"dz\": -0.03, \"dpitch\": 4.64, \"dyaw\": -1.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.49, "window_alt_abs_m": 0.48, "target_px_mean_hist": 544.0, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00038/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [96.93, 126.83, 20.02, -48.8, -82.29, 0.0]\n  Target bbox: [590.43, 294.71, 619.62, 357.08]\n\nFrame 2:\n  Drone pose: [97.04, 126.29, 20.04, -46.86, -85.66, 0.0]\n  Target bbox: [620.31, 328.03, 659.97, 390.96]\n\nFrame 3:\n  Drone pose: [96.91, 125.83, 20.01, -47.96, -84.99, 0.0]\n  Target bbox: [622.97, 308.39, 650.7, 369.94]\n\nFrame 4:\n  Drone pose: [96.9, 125.33, 20.01, -51.74, -83.81, 0.0]\n  Target bbox: [600.44, 242.99, 645.76, 308.18]\n\nFrame 5 (current):\n  Drone pose: [96.9, 124.83, 20.01, -47.26, -80.23, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.68, \"ymin\": 319.83, \"xmax\": 603.59, \"ymax\": 385.45}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.52, \"dyaw\": -5.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.0, \"dz\": -0.01, \"dpitch\": 0.53, \"dyaw\": -5.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.5, \"dz\": -0.01, \"dpitch\": 0.53, \"dyaw\": -5.04, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.0, \"dz\": -0.01, \"dpitch\": 0.52, \"dyaw\": -5.09, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -2.51, \"dz\": -0.01, \"dpitch\": 0.52, \"dyaw\": -5.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.81, "window_alt_abs_m": 0.05, "target_px_mean_hist": 532.5, "cur_frame_id": 38, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.99, 118.21, 19.95, -46.95, -88.71, 0.0]\n  Target bbox: [623.66, 328.36, 656.62, 390.64]\n\nFrame 2:\n  Drone pose: [97.95, 117.53, 20.07, -50.28, -83.57, 0.0]\n  Target bbox: [563.27, 281.42, 602.34, 344.41]\n\nFrame 3:\n  Drone pose: [98.24, 117.13, 20.0, -51.1, -86.13, 0.0]\n  Target bbox: [580.44, 261.93, 621.46, 325.74]\n\nFrame 4:\n  Drone pose: [98.47, 116.58, 20.0, -47.16, -91.85, 0.0]\n  Target bbox: [644.25, 329.78, 672.29, 391.82]\n\nFrame 5 (current):\n  Drone pose: [98.71, 116.03, 20.0, -42.35, -93.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 649.25, \"ymin\": 411.53, \"xmax\": 687.92, \"ymax\": 474.86}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -5.04, \"dyaw\": 1.74, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": -5.12, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -5.2, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": -2.23, \"dz\": 0.0, \"dpitch\": -5.28, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -2.79, \"dz\": 0.0, \"dpitch\": -5.38, \"dyaw\": 0.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.08, "window_alt_abs_m": 0.19, "target_px_mean_hist": 517.8, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.36, 108.95, 20.17, -47.87, -99.91, 0.0]\n  Target bbox: [621.77, 325.48, 658.22, 393.61]\n\nFrame 2:\n  Drone pose: [96.99, 108.5, 20.16, -47.72, -100.31, 0.0]\n  Target bbox: [618.24, 322.15, 661.83, 397.04]\n\nFrame 3:\n  Drone pose: [96.65, 107.88, 20.0, -47.15, -104.97, 0.0]\n  Target bbox: [663.45, 330.94, 710.37, 407.0]\n\nFrame 4:\n  Drone pose: [96.37, 107.39, 20.0, -45.54, -102.47, 0.0]\n  Target bbox: [628.29, 359.27, 671.62, 427.25]\n\nFrame 5 (current):\n  Drone pose: [96.14, 106.9, 20.0, -47.6, -100.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.78, \"ymin\": 326.37, \"xmax\": 660.99, \"ymax\": 392.62}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.65, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": -0.51, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.72, \"droll\": 0.0}, {\"dx\": -0.65, \"dy\": -1.88, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 2.2, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 2.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.19, "window_alt_abs_m": 0.17, "target_px_mean_hist": 544.5, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [93.96, 99.15, 19.94, -47.55, -93.55, 0.0]\n  Target bbox: [617.47, 326.76, 662.19, 392.16]\n\nFrame 2:\n  Drone pose: [93.87, 98.84, 20.0, -45.24, -97.7, 0.0]\n  Target bbox: [672.41, 363.55, 710.12, 429.15]\n\nFrame 3:\n  Drone pose: [93.84, 98.4, 20.07, -47.36, -93.11, 0.0]\n  Target bbox: [620.3, 327.91, 659.41, 391.01]\n\nFrame 4:\n  Drone pose: [93.66, 97.87, 19.86, -42.1, -93.91, 0.0]\n  Target bbox: [641.64, 412.32, 670.84, 475.1]\n\nFrame 5 (current):\n  Drone pose: [93.53, 97.5, 19.88, -51.54, -94.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 645.86, \"ymin\": 249.88, \"xmax\": 691.47, \"ymax\": 314.84}, \"waypoint_deltas\": [{\"dx\": -0.03, \"dy\": -0.6, \"dz\": 0.12, \"dpitch\": 4.27, \"dyaw\": 2.52, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -1.09, \"dz\": 0.12, \"dpitch\": 4.29, \"dyaw\": 2.79, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": -1.57, \"dz\": 0.12, \"dpitch\": 4.3, \"dyaw\": 3.06, \"droll\": 0.0}, {\"dx\": -0.26, \"dy\": -2.06, \"dz\": 0.12, \"dpitch\": 4.32, \"dyaw\": 3.31, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": -2.55, \"dz\": 0.12, \"dpitch\": 4.11, \"dyaw\": 3.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.17, "window_alt_abs_m": 0.37, "target_px_mean_hist": 562.0, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00105/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00109/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.82, 91.13, 19.98, -47.12, -89.77, 0.0]\n  Target bbox: [626.5, 329.4, 653.59, 389.6]\n\nFrame 2:\n  Drone pose: [92.83, 90.54, 20.0, -48.35, -84.8, 0.0]\n  Target bbox: [564.3, 312.18, 601.3, 375.02]\n\nFrame 3:\n  Drone pose: [92.84, 90.04, 20.0, -42.55, -84.82, 0.0]\n  Target bbox: [565.8, 409.14, 599.54, 472.85]\n\nFrame 4:\n  Drone pose: [92.88, 89.58, 20.14, -47.42, -89.95, 0.0]\n  Target bbox: [619.58, 329.06, 660.45, 389.9]\n\nFrame 5 (current):\n  Drone pose: [92.8, 89.13, 19.85, -46.91, -89.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.83, \"ymin\": 328.66, \"xmax\": 660.29, \"ymax\": 390.3}, \"waypoint_deltas\": [{\"dx\": 0.1, \"dy\": -0.57, \"dz\": 0.15, \"dpitch\": -0.35, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -1.07, \"dz\": 0.15, \"dpitch\": -0.34, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.56, \"dz\": 0.15, \"dpitch\": -0.34, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -2.06, \"dz\": 0.15, \"dpitch\": -0.33, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": -2.56, \"dz\": 0.15, \"dpitch\": -0.32, \"dyaw\": -0.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.36, "window_alt_abs_m": 0.45, "target_px_mean_hist": 540.0, "cur_frame_id": 109, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00126/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.46, 82.54, 20.0, -46.06, -92.24, 0.0]\n  Target bbox: [627.9, 346.58, 661.38, 413.87]\n\nFrame 2:\n  Drone pose: [92.36, 82.05, 20.0, -46.13, -95.26, 0.0]\n  Target bbox: [643.24, 345.62, 685.83, 411.12]\n\nFrame 3:\n  Drone pose: [92.19, 81.67, 19.93, -47.84, -90.29, 0.0]\n  Target bbox: [594.25, 313.9, 632.41, 375.39]\n\nFrame 4:\n  Drone pose: [92.1, 81.01, 19.81, -47.02, -92.32, 0.0]\n  Target bbox: [619.76, 327.77, 659.93, 391.14]\n\nFrame 5 (current):\n  Drone pose: [92.02, 80.55, 20.0, -48.99, -93.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 636.47, \"ymin\": 298.82, \"xmax\": 681.23, \"ymax\": 362.5}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 1.74, \"dyaw\": 1.83, \"droll\": 0.0}, {\"dx\": -0.11, \"dy\": -0.99, \"dz\": 0.0, \"dpitch\": 1.74, \"dyaw\": 1.97, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": 1.74, \"dyaw\": 2.04, \"droll\": 0.0}, {\"dx\": -0.13, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": 1.74, \"dyaw\": 2.04, \"droll\": 0.0}, {\"dx\": -0.1, \"dy\": -2.47, \"dz\": 0.0, \"dpitch\": 1.78, \"dyaw\": 1.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.36, "window_alt_abs_m": 0.39, "target_px_mean_hist": 543.8, "cur_frame_id": 126, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00139/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00140/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00141/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00143/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [92.72, 74.25, 20.0, -45.11, -99.26, 0.0]\n  Target bbox: [679.09, 354.75, 716.74, 419.1]\n\nFrame 2:\n  Drone pose: [92.9, 73.75, 19.89, -46.45, -94.85, 0.0]\n  Target bbox: [619.25, 327.55, 660.46, 391.45]\n\nFrame 3:\n  Drone pose: [92.7, 73.09, 20.08, -47.32, -96.5, 0.0]\n  Target bbox: [648.54, 322.55, 683.45, 386.92]\n\nFrame 4:\n  Drone pose: [92.84, 72.62, 20.0, -48.51, -96.73, 0.0]\n  Target bbox: [647.21, 299.35, 680.35, 363.87]\n\nFrame 5 (current):\n  Drone pose: [92.75, 72.04, 19.95, -50.26, -91.84, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 589.21, \"ymin\": 271.71, \"xmax\": 630.26, \"ymax\": 335.07}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": -0.5, \"dz\": 0.05, \"dpitch\": 3.29, \"dyaw\": -2.84, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -1.03, \"dz\": 0.05, \"dpitch\": 3.24, \"dyaw\": -2.81, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.54, \"dz\": 0.05, \"dpitch\": 3.23, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -2.02, \"dz\": 0.05, \"dpitch\": 3.27, \"dyaw\": -2.98, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.51, \"dz\": 0.05, \"dpitch\": 3.29, \"dyaw\": -3.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.19, "window_alt_abs_m": 0.43, "target_px_mean_hist": 552.2, "cur_frame_id": 143, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00156/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00157/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00158/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00159/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311/aug_001/frames_playback/frame_00160/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [94.21, 65.52, 20.0, -46.6, -100.69, 0.0]\n  Target bbox: [623.33, 326.53, 656.67, 392.59]\n\nFrame 2:\n  Drone pose: [94.24, 64.98, 19.95, -47.55, -101.29, 0.0]\n  Target bbox: [607.25, 306.9, 647.22, 373.87]\n\nFrame 3:\n  Drone pose: [94.52, 64.51, 20.0, -46.33, -103.23, 0.0]\n  Target bbox: [621.8, 325.13, 658.27, 394.05]\n\nFrame 4:\n  Drone pose: [94.69, 63.97, 20.03, -46.19, -105.31, 0.0]\n  Target bbox: [618.58, 325.66, 661.25, 393.42]\n\nFrame 5 (current):\n  Drone pose: [94.84, 63.51, 20.0, -48.18, -102.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 580.33, \"ymin\": 290.36, \"xmax\": 624.07, \"ymax\": 357.66}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 2.23, \"dyaw\": -3.66, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 2.3, \"dyaw\": -4.13, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 2.37, \"dyaw\": -4.6, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 2.44, \"dyaw\": -5.07, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 2.51, \"dyaw\": -5.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.39, "window_alt_abs_m": 0.17, "target_px_mean_hist": 520.2, "cur_frame_id": 160, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_311", "difficulty_score": 0.4595, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.11, 44.97, 22.0, -46.48, 87.14, 0.0]\n  Target bbox: [621.34, 326.28, 658.67, 393.11]\n\nFrame 2:\n  Drone pose: [-107.0, 44.54, 21.2, -43.92, 83.46, 0.0]\n  Target bbox: [624.31, 330.3, 655.46, 389.04]\n\nFrame 3:\n  Drone pose: [-107.37, 44.62, 20.67, -42.54, 82.63, 0.0]\n  Target bbox: [627.5, 330.15, 652.32, 389.18]\n\nFrame 4:\n  Drone pose: [-107.46, 44.96, 20.64, -42.28, 82.43, 0.0]\n  Target bbox: [624.9, 330.28, 654.91, 389.07]\n\nFrame 5 (current):\n  Drone pose: [-107.46, 45.42, 20.62, -42.18, 82.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.05, \"ymin\": 329.38, \"xmax\": 659.68, \"ymax\": 390.04}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.5, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.01, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.14, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 1.53, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.04, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 2.55, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.73, "window_alt_abs_m": 1.38, "target_px_mean_hist": 480.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.24, 52.05, 20.19, -41.77, 82.99, 0.0]\n  Target bbox: [627.57, 330.05, 652.25, 389.32]\n\nFrame 2:\n  Drone pose: [-107.27, 52.56, 20.17, -41.75, 82.9, 0.0]\n  Target bbox: [626.32, 329.71, 653.49, 389.64]\n\nFrame 3:\n  Drone pose: [-107.32, 53.08, 20.15, -41.73, 82.77, 0.0]\n  Target bbox: [623.85, 328.76, 655.91, 390.59]\n\nFrame 4:\n  Drone pose: [-107.38, 53.59, 20.13, -41.72, 82.59, 0.0]\n  Target bbox: [618.39, 327.61, 661.3, 391.77]\n\nFrame 5 (current):\n  Drone pose: [-107.47, 54.11, 20.12, -41.71, 82.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.91, \"ymin\": 330.06, \"xmax\": 653.9, \"ymax\": 389.3}, \"waypoint_deltas\": [{\"dx\": -0.11, \"dy\": 0.53, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": -0.24, \"dy\": 1.05, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": -0.64, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": 1.59, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": -0.56, \"dy\": 2.12, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": -1.52, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": 2.67, \"dz\": -0.06, \"dpitch\": 0.02, \"dyaw\": -2.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.63, "window_alt_abs_m": 0.07, "target_px_mean_hist": 516.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-109.28, 61.9, 20.01, -41.83, 74.58, 0.0]\n  Target bbox: [619.02, 327.25, 660.78, 392.22]\n\nFrame 2:\n  Drone pose: [-109.19, 62.47, 20.01, -41.95, 74.74, 0.0]\n  Target bbox: [623.35, 327.86, 656.7, 391.58]\n\nFrame 3:\n  Drone pose: [-109.04, 63.05, 20.01, -41.92, 73.78, 0.0]\n  Target bbox: [622.32, 326.37, 657.81, 393.15]\n\nFrame 4:\n  Drone pose: [-108.84, 63.62, 20.01, -41.89, 72.96, 0.0]\n  Target bbox: [622.03, 326.09, 658.12, 393.45]\n\nFrame 5 (current):\n  Drone pose: [-108.58, 64.18, 20.01, -41.87, 72.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.74, \"ymin\": 327.18, \"xmax\": 657.36, \"ymax\": 392.3}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": 0.54, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 1.09, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.97, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 1.62, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": 2.15, \"dz\": -0.01, \"dpitch\": 0.06, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": 2.65, \"dz\": -0.01, \"dpitch\": 0.11, \"dyaw\": -1.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.61, "window_alt_abs_m": 0.01, "target_px_mean_hist": 510.0, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.6, 71.13, 20.0, -41.78, 72.26, 0.0]\n  Target bbox: [618.38, 325.63, 661.42, 393.81]\n\nFrame 2:\n  Drone pose: [-104.45, 71.6, 20.0, -41.8, 72.69, 0.0]\n  Target bbox: [619.15, 326.43, 660.67, 393.01]\n\nFrame 3:\n  Drone pose: [-104.29, 72.06, 20.0, -41.82, 73.12, 0.0]\n  Target bbox: [617.56, 326.62, 662.24, 392.89]\n\nFrame 4:\n  Drone pose: [-104.14, 72.53, 20.0, -41.84, 73.54, 0.0]\n  Target bbox: [625.43, 328.71, 654.4, 390.71]\n\nFrame 5 (current):\n  Drone pose: [-103.99, 72.99, 20.0, -41.85, 73.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.6, \"ymin\": 327.48, \"xmax\": 659.2, \"ymax\": 391.98}, \"waypoint_deltas\": [{\"dx\": 0.14, \"dy\": 0.47, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 0.94, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.41, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.13, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": 1.89, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.46, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 2.36, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.69, "window_alt_abs_m": 0.0, "target_px_mean_hist": 510.5, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.79, 79.68, 20.0, -41.85, 77.3, 0.0]\n  Target bbox: [617.89, 327.2, 661.86, 392.24]\n\nFrame 2:\n  Drone pose: [-102.74, 80.18, 20.0, -41.86, 77.44, 0.0]\n  Target bbox: [621.95, 327.87, 657.84, 391.51]\n\nFrame 3:\n  Drone pose: [-102.67, 80.68, 20.0, -41.88, 77.62, 0.0]\n  Target bbox: [620.88, 327.73, 658.9, 391.64]\n\nFrame 4:\n  Drone pose: [-102.57, 81.2, 20.0, -41.93, 77.85, 0.0]\n  Target bbox: [619.35, 327.19, 660.41, 392.19]\n\nFrame 5 (current):\n  Drone pose: [-102.45, 81.73, 20.0, -42.0, 78.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.84, \"ymin\": 327.24, \"xmax\": 659.92, \"ymax\": 392.12}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 0.4, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 1.13, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": 1.73, \"dz\": 0.0, \"dpitch\": -0.33, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 0.76, \"dy\": 2.32, \"dz\": 0.0, \"dpitch\": -0.36, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": 2.89, \"dz\": 0.0, \"dpitch\": -0.5, \"dyaw\": -0.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 518.2, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.99, 88.96, 20.0, -42.44, 79.27, 0.0]\n  Target bbox: [617.3, 326.42, 662.41, 392.95]\n\nFrame 2:\n  Drone pose: [-100.93, 89.43, 20.0, -42.42, 79.44, 0.0]\n  Target bbox: [617.63, 327.25, 662.09, 392.15]\n\nFrame 3:\n  Drone pose: [-100.88, 89.91, 20.0, -42.4, 79.6, 0.0]\n  Target bbox: [626.72, 328.92, 653.1, 390.4]\n\nFrame 4:\n  Drone pose: [-100.83, 90.38, 20.0, -42.38, 79.75, 0.0]\n  Target bbox: [623.32, 328.4, 656.47, 390.93]\n\nFrame 5 (current):\n  Drone pose: [-100.78, 90.86, 20.0, -42.36, 79.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.56, \"ymin\": 327.34, \"xmax\": 660.18, \"ymax\": 391.99}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.46, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.95, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": 2.44, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.61, "window_alt_abs_m": 0.0, "target_px_mean_hist": 533.0, "cur_frame_id": 93, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.89, 97.96, 20.0, -42.19, 76.87, 0.0]\n  Target bbox: [621.66, 327.84, 658.14, 391.53]\n\nFrame 2:\n  Drone pose: [-102.12, 98.54, 20.0, -42.22, 76.21, 0.0]\n  Target bbox: [626.34, 328.6, 653.49, 390.73]\n\nFrame 3:\n  Drone pose: [-102.32, 99.13, 20.0, -42.26, 75.6, 0.0]\n  Target bbox: [624.39, 328.24, 655.43, 391.11]\n\nFrame 4:\n  Drone pose: [-102.48, 99.72, 20.0, -42.33, 75.11, 0.0]\n  Target bbox: [627.12, 328.7, 652.72, 390.63]\n\nFrame 5 (current):\n  Drone pose: [-102.57, 100.34, 20.0, -42.45, 74.79, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.13, \"ymin\": 327.76, \"xmax\": 655.69, \"ymax\": 391.58}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.28, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 1.93, \"dz\": 0.0, \"dpitch\": -0.28, \"dyaw\": -2.4, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 2.59, \"dz\": 0.0, \"dpitch\": -0.36, \"dyaw\": -3.33, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": 3.25, \"dz\": 0.0, \"dpitch\": -0.45, \"dyaw\": -4.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.07, "window_alt_abs_m": 0.0, "target_px_mean_hist": 507.5, "cur_frame_id": 111, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00129/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.77, 108.76, 20.0, -43.06, 60.13, 0.0]\n  Target bbox: [620.73, 325.4, 659.19, 394.02]\n\nFrame 2:\n  Drone pose: [-100.77, 109.26, 20.0, -43.31, 58.18, 0.0]\n  Target bbox: [622.78, 327.77, 657.42, 391.52]\n\nFrame 3:\n  Drone pose: [-100.76, 109.77, 20.0, -42.95, 57.0, 0.0]\n  Target bbox: [622.7, 326.68, 657.22, 392.76]\n\nFrame 4:\n  Drone pose: [-100.72, 110.28, 20.0, -43.18, 55.13, 0.0]\n  Target bbox: [623.3, 327.4, 656.96, 391.95]\n\nFrame 5 (current):\n  Drone pose: [-100.65, 110.78, 20.0, -42.84, 54.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.44, \"ymin\": 324.31, \"xmax\": 661.59, \"ymax\": 395.25}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -2.45, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 1.43, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -3.98, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": 1.88, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": -5.44, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 2.33, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -5.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.98, "window_alt_abs_m": 0.0, "target_px_mean_hist": 513.2, "cur_frame_id": 129, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00147/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.12, 116.83, 20.0, -42.73, 41.26, 0.0]\n  Target bbox: [617.19, 321.43, 663.07, 398.25]\n\nFrame 2:\n  Drone pose: [-96.78, 117.19, 20.0, -42.88, 40.22, 0.0]\n  Target bbox: [627.93, 326.78, 651.82, 392.55]\n\nFrame 3:\n  Drone pose: [-96.43, 117.53, 20.0, -42.59, 40.29, 0.0]\n  Target bbox: [621.52, 324.89, 658.58, 394.6]\n\nFrame 4:\n  Drone pose: [-96.06, 117.87, 20.0, -42.74, 39.32, 0.0]\n  Target bbox: [625.98, 327.76, 653.77, 391.55]\n\nFrame 5 (current):\n  Drone pose: [-95.69, 118.21, 20.0, -42.47, 39.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.52, \"ymin\": 327.2, \"xmax\": 655.52, \"ymax\": 392.17}, \"waypoint_deltas\": [{\"dx\": 0.39, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -0.9, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 0.65, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -1.57, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 0.19, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": 1.61, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -2.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.2, "window_alt_abs_m": 0.0, "target_px_mean_hist": 544.0, "cur_frame_id": 147, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/ORI/frames_playback/frame_00165/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.07, 122.82, 20.0, -42.26, 33.66, 0.0]\n  Target bbox: [622.84, 326.08, 656.86, 393.37]\n\nFrame 2:\n  Drone pose: [-89.67, 123.15, 20.0, -42.02, 33.89, 0.0]\n  Target bbox: [623.82, 327.14, 656.27, 392.33]\n\nFrame 3:\n  Drone pose: [-89.27, 123.47, 20.0, -42.14, 33.02, 0.0]\n  Target bbox: [624.45, 329.46, 655.38, 389.88]\n\nFrame 4:\n  Drone pose: [-88.86, 123.77, 20.0, -41.89, 33.33, 0.0]\n  Target bbox: [623.6, 326.94, 656.51, 392.56]\n\nFrame 5 (current):\n  Drone pose: [-88.45, 124.06, 20.0, -42.0, 32.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.59, \"ymin\": 327.48, \"xmax\": 655.52, \"ymax\": 391.98}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": 0.28, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -0.76, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 1.3, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.1, \"droll\": 0.0}, {\"dx\": 1.75, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": 1.44, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -1.44, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.19, "window_alt_abs_m": 0.0, "target_px_mean_hist": 510.2, "cur_frame_id": 165, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-106.11, 44.97, 22.0, -49.57, 84.56, 0.0]\n  Target bbox: [651.67, 275.52, 688.68, 341.14]\n\nFrame 2:\n  Drone pose: [-107.0, 44.54, 21.2, -43.92, 83.46, 0.0]\n  Target bbox: [624.01, 329.56, 655.76, 389.74]\n\nFrame 3:\n  Drone pose: [-107.41, 44.61, 20.68, -46.3, 83.27, 0.0]\n  Target bbox: [613.62, 266.69, 647.37, 326.54]\n\nFrame 4:\n  Drone pose: [-107.46, 44.96, 20.64, -39.33, 82.48, 0.0]\n  Target bbox: [626.69, 379.98, 651.91, 438.42]\n\nFrame 5 (current):\n  Drone pose: [-107.46, 45.42, 20.62, -46.31, 78.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 668.13, \"ymin\": 259.92, \"xmax\": 707.83, \"ymax\": 323.1}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": 0.5, \"dz\": -0.03, \"dpitch\": 4.15, \"dyaw\": 3.9, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.01, \"dz\": -0.05, \"dpitch\": 4.16, \"dyaw\": 3.97, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 1.53, \"dz\": -0.07, \"dpitch\": 4.18, \"dyaw\": 4.03, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": 2.04, \"dz\": -0.09, \"dpitch\": 4.19, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 2.55, \"dz\": -0.2, \"dpitch\": 4.33, \"dyaw\": 4.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.94, "window_alt_abs_m": 1.38, "target_px_mean_hist": 493.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.24, 52.05, 20.19, -37.69, 79.93, 0.0]\n  Target bbox: [656.87, 397.17, 700.15, 460.85]\n\nFrame 2:\n  Drone pose: [-107.16, 52.55, 20.12, -41.68, 83.19, 0.0]\n  Target bbox: [618.95, 327.59, 660.73, 391.78]\n\nFrame 3:\n  Drone pose: [-107.32, 53.08, 20.15, -41.73, 82.77, 0.0]\n  Target bbox: [618.29, 327.77, 661.39, 391.61]\n\nFrame 4:\n  Drone pose: [-107.31, 53.6, 20.18, -41.81, 82.78, 0.0]\n  Target bbox: [625.06, 329.13, 654.72, 390.22]\n\nFrame 5 (current):\n  Drone pose: [-107.34, 54.24, 20.16, -37.63, 79.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 655.6, \"ymin\": 401.64, \"xmax\": 692.38, \"ymax\": 464.19}, \"waypoint_deltas\": [{\"dx\": -0.24, \"dy\": 0.4, \"dz\": -0.06, \"dpitch\": -4.07, \"dyaw\": 2.11, \"droll\": 0.0}, {\"dx\": -0.37, \"dy\": 0.92, \"dz\": -0.07, \"dpitch\": -4.06, \"dyaw\": 1.76, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": 1.46, \"dz\": -0.08, \"dpitch\": -4.05, \"dyaw\": 1.34, \"droll\": 0.0}, {\"dx\": -0.69, \"dy\": 1.99, \"dz\": -0.09, \"dpitch\": -4.05, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": -0.88, \"dy\": 2.54, \"dz\": -0.1, \"dpitch\": -4.06, \"dyaw\": 0.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.5, "window_alt_abs_m": 0.16, "target_px_mean_hist": 515.0, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-109.28, 61.9, 20.01, -41.83, 74.58, 0.0]\n  Target bbox: [621.27, 327.44, 658.54, 391.98]\n\nFrame 2:\n  Drone pose: [-109.35, 62.37, 20.04, -41.81, 74.41, 0.0]\n  Target bbox: [622.69, 327.58, 657.35, 391.88]\n\nFrame 3:\n  Drone pose: [-109.04, 63.05, 20.01, -41.92, 73.78, 0.0]\n  Target bbox: [620.52, 324.81, 659.65, 394.77]\n\nFrame 4:\n  Drone pose: [-108.84, 63.62, 20.01, -45.41, 74.93, 0.0]\n  Target bbox: [596.91, 268.28, 633.16, 333.64]\n\nFrame 5 (current):\n  Drone pose: [-108.58, 64.18, 20.01, -42.3, 69.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 656.03, \"ymin\": 320.13, \"xmax\": 693.21, \"ymax\": 386.01}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": 0.54, \"dz\": -0.01, \"dpitch\": 0.45, \"dyaw\": 2.21, \"droll\": 0.0}, {\"dx\": 0.66, \"dy\": 1.09, \"dz\": -0.01, \"dpitch\": 0.46, \"dyaw\": 1.77, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 1.62, \"dz\": -0.01, \"dpitch\": 0.48, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": 2.15, \"dz\": -0.01, \"dpitch\": 0.49, \"dyaw\": 1.11, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": 2.65, \"dz\": -0.01, \"dpitch\": 0.54, \"dyaw\": 0.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.32, "window_alt_abs_m": 0.07, "target_px_mean_hist": 495.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.56, 71.11, 20.18, -37.88, 74.3, 0.0]\n  Target bbox: [594.61, 397.1, 636.51, 462.96]\n\nFrame 2:\n  Drone pose: [-104.45, 71.6, 20.0, -41.07, 74.27, 0.0]\n  Target bbox: [602.87, 340.85, 636.79, 403.65]\n\nFrame 3:\n  Drone pose: [-104.29, 72.06, 20.0, -41.82, 73.12, 0.0]\n  Target bbox: [626.07, 328.94, 653.78, 390.45]\n\nFrame 4:\n  Drone pose: [-104.1, 72.63, 19.9, -41.85, 73.55, 0.0]\n  Target bbox: [625.94, 328.59, 653.89, 390.8]\n\nFrame 5 (current):\n  Drone pose: [-104.07, 73.1, 19.91, -41.82, 73.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.3, \"ymin\": 326.92, \"xmax\": 659.51, \"ymax\": 392.49}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": 0.36, \"dz\": 0.09, \"dpitch\": -0.05, \"dyaw\": 0.68, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 0.83, \"dz\": 0.09, \"dpitch\": -0.06, \"dyaw\": 1.06, \"droll\": 0.0}, {\"dx\": 0.49, \"dy\": 1.3, \"dz\": 0.09, \"dpitch\": -0.06, \"dyaw\": 1.41, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": 1.78, \"dz\": 0.09, \"dpitch\": -0.07, \"dyaw\": 1.74, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": 2.25, \"dz\": 0.09, \"dpitch\": -0.07, \"dyaw\": 2.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.74, "window_alt_abs_m": 0.28, "target_px_mean_hist": 506.2, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.81, 79.76, 20.06, -42.03, 77.18, 0.0]\n  Target bbox: [617.45, 326.36, 662.29, 393.06]\n\nFrame 2:\n  Drone pose: [-102.84, 80.17, 20.13, -42.02, 77.17, 0.0]\n  Target bbox: [622.75, 328.52, 657.05, 390.89]\n\nFrame 3:\n  Drone pose: [-102.59, 80.68, 19.95, -42.6, 80.17, 0.0]\n  Target bbox: [593.52, 315.57, 626.95, 378.54]\n\nFrame 4:\n  Drone pose: [-102.66, 81.17, 19.96, -41.81, 77.63, 0.0]\n  Target bbox: [619.39, 327.09, 660.37, 392.29]\n\nFrame 5 (current):\n  Drone pose: [-102.36, 81.63, 20.03, -41.95, 78.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.45, \"ymin\": 327.31, \"xmax\": 660.3, \"ymax\": 392.07}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": 0.65, \"dz\": -0.03, \"dpitch\": -0.17, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": 1.23, \"dz\": -0.03, \"dpitch\": -0.33, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": 0.48, \"dy\": 1.83, \"dz\": -0.03, \"dpitch\": -0.38, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": 2.42, \"dz\": -0.03, \"dpitch\": -0.41, \"dyaw\": -1.11, \"droll\": 0.0}, {\"dx\": 0.83, \"dy\": 2.99, \"dz\": -0.03, \"dpitch\": -0.55, \"dyaw\": -0.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.37, "window_alt_abs_m": 0.34, "target_px_mean_hist": 530.0, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.99, 88.96, 20.0, -38.38, 83.11, 0.0]\n  Target bbox: [570.92, 396.94, 612.77, 460.79]\n\nFrame 2:\n  Drone pose: [-100.93, 89.43, 20.0, -42.42, 79.44, 0.0]\n  Target bbox: [622.49, 327.95, 657.28, 391.38]\n\nFrame 3:\n  Drone pose: [-100.88, 89.91, 20.0, -42.4, 79.6, 0.0]\n  Target bbox: [623.54, 328.13, 656.24, 391.19]\n\nFrame 4:\n  Drone pose: [-100.75, 90.38, 19.89, -42.22, 79.94, 0.0]\n  Target bbox: [624.54, 328.11, 655.24, 391.21]\n\nFrame 5 (current):\n  Drone pose: [-100.64, 90.82, 19.98, -42.29, 80.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.17, \"ymin\": 328.03, \"xmax\": 659.57, \"ymax\": 391.34}, \"waypoint_deltas\": [{\"dx\": -0.11, \"dy\": 0.52, \"dz\": 0.02, \"dpitch\": -0.05, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 1.01, \"dz\": 0.02, \"dpitch\": -0.04, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": 1.5, \"dz\": 0.02, \"dpitch\": -0.02, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": 1.99, \"dz\": 0.02, \"dpitch\": -0.01, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -0.09, \"dy\": 2.48, \"dz\": 0.02, \"dpitch\": 0.0, \"dyaw\": -0.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.51, "window_alt_abs_m": 0.2, "target_px_mean_hist": 521.5, "cur_frame_id": 93, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.03, 98.1, 19.98, -42.29, 76.4, 0.0]\n  Target bbox: [621.08, 327.37, 658.71, 392.0]\n\nFrame 2:\n  Drone pose: [-102.12, 98.54, 20.0, -45.05, 77.8, 0.0]\n  Target bbox: [606.52, 281.3, 633.22, 343.14]\n\nFrame 3:\n  Drone pose: [-102.46, 99.11, 20.09, -42.33, 75.24, 0.0]\n  Target bbox: [625.8, 328.92, 654.03, 390.47]\n\nFrame 4:\n  Drone pose: [-102.48, 99.72, 20.0, -41.51, 71.01, 0.0]\n  Target bbox: [667.69, 340.73, 714.74, 408.8]\n\nFrame 5 (current):\n  Drone pose: [-102.57, 100.34, 20.0, -40.17, 69.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 681.74, \"ymin\": 366.06, \"xmax\": 721.33, \"ymax\": 433.58}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": -2.46, \"dyaw\": 4.84, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": 1.28, \"dz\": 0.0, \"dpitch\": -2.5, \"dyaw\": 3.6, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 1.93, \"dz\": 0.0, \"dpitch\": -2.56, \"dyaw\": 2.54, \"droll\": 0.0}, {\"dx\": 0.42, \"dy\": 2.59, \"dz\": 0.0, \"dpitch\": -2.64, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": 3.25, \"dz\": 0.0, \"dpitch\": -2.73, \"dyaw\": 0.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.34, "window_alt_abs_m": 0.19, "target_px_mean_hist": 515.2, "cur_frame_id": 111, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00129/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.73, 108.87, 20.04, -46.07, 56.36, 0.0]\n  Target bbox: [665.3, 278.86, 705.7, 348.9]\n\nFrame 2:\n  Drone pose: [-100.77, 109.26, 20.0, -45.37, 60.09, 0.0]\n  Target bbox: [601.93, 292.48, 630.8, 357.97]\n\nFrame 3:\n  Drone pose: [-100.83, 109.92, 19.97, -44.74, 56.21, 0.0]\n  Target bbox: [622.28, 294.26, 667.6, 368.0]\n\nFrame 4:\n  Drone pose: [-100.72, 110.28, 20.0, -41.46, 51.22, 0.0]\n  Target bbox: [675.04, 357.74, 701.45, 421.6]\n\nFrame 5 (current):\n  Drone pose: [-100.75, 110.72, 19.88, -40.07, 53.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.3, \"ymin\": 364.55, \"xmax\": 666.34, \"ymax\": 436.81}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": 0.56, \"dz\": 0.12, \"dpitch\": -2.99, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": 1.03, \"dz\": 0.12, \"dpitch\": -2.66, \"dyaw\": -1.99, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": 1.49, \"dz\": 0.12, \"dpitch\": -2.86, \"dyaw\": -3.52, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": 1.94, \"dz\": 0.12, \"dpitch\": -3.07, \"dyaw\": -4.98, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": 2.39, \"dz\": 0.12, \"dpitch\": -2.77, \"dyaw\": -5.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.07, "window_alt_abs_m": 0.21, "target_px_mean_hist": 521.2, "cur_frame_id": 129, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00147/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-97.18, 116.79, 20.02, -42.65, 41.25, 0.0]\n  Target bbox: [621.52, 325.03, 658.59, 394.48]\n\nFrame 2:\n  Drone pose: [-96.62, 117.21, 20.06, -44.47, 35.47, 0.0]\n  Target bbox: [685.22, 309.29, 717.62, 370.07]\n\nFrame 3:\n  Drone pose: [-96.38, 117.51, 20.08, -44.67, 43.07, 0.0]\n  Target bbox: [586.34, 291.53, 627.5, 364.18]\n\nFrame 4:\n  Drone pose: [-96.06, 117.87, 20.0, -38.48, 43.48, 0.0]\n  Target bbox: [572.36, 398.6, 603.96, 466.64]\n\nFrame 5 (current):\n  Drone pose: [-95.69, 118.21, 20.0, -45.23, 34.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 679.74, \"ymin\": 277.27, \"xmax\": 725.44, \"ymax\": 353.21}, \"waypoint_deltas\": [{\"dx\": 0.39, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": 2.61, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 0.65, \"dz\": 0.0, \"dpitch\": 2.87, \"dyaw\": 4.29, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": 2.7, \"dyaw\": 3.43, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 2.95, \"dyaw\": 3.66, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": 1.61, \"dz\": 0.0, \"dpitch\": 2.79, \"dyaw\": 2.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.82, "window_alt_abs_m": 0.15, "target_px_mean_hist": 526.8, "cur_frame_id": 147, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741/aug_001/frames_playback/frame_00165/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-89.97, 122.9, 19.96, -38.46, 31.16, 0.0]\n  Target bbox: [653.78, 390.34, 687.78, 461.72]\n\nFrame 2:\n  Drone pose: [-89.63, 123.24, 20.12, -42.32, 33.75, 0.0]\n  Target bbox: [622.71, 325.74, 657.42, 393.75]\n\nFrame 3:\n  Drone pose: [-89.36, 123.41, 20.15, -42.03, 34.71, 0.0]\n  Target bbox: [605.56, 330.56, 631.68, 395.95]\n\nFrame 4:\n  Drone pose: [-88.78, 123.76, 19.99, -43.74, 28.47, 0.0]\n  Target bbox: [685.44, 298.74, 720.19, 365.12]\n\nFrame 5 (current):\n  Drone pose: [-88.44, 124.08, 20.03, -44.28, 32.86, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.72, \"ymin\": 289.01, \"xmax\": 652.52, \"ymax\": 356.55}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": 0.26, \"dz\": -0.03, \"dpitch\": 2.16, \"dyaw\": -1.08, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": 0.55, \"dz\": -0.03, \"dpitch\": 2.38, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": 0.84, \"dz\": -0.03, \"dpitch\": 2.25, \"dyaw\": -1.42, \"droll\": 0.0}, {\"dx\": 1.74, \"dy\": 1.13, \"dz\": -0.03, \"dpitch\": 2.46, \"dyaw\": -1.02, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": 1.42, \"dz\": -0.03, \"dpitch\": 2.31, \"dyaw\": -1.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.19, "window_alt_abs_m": 0.38, "target_px_mean_hist": 522.2, "cur_frame_id": 165, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-15/trajectory_1776205741", "difficulty_score": 0.2987, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.97, 84.56, 22.0, -55.94, 70.01, 0.0]\n  Target bbox: [618.81, 324.51, 661.35, 394.29]\n\nFrame 2:\n  Drone pose: [-56.83, 84.85, 21.2, -54.29, 68.99, 0.0]\n  Target bbox: [617.88, 322.15, 662.33, 396.62]\n\nFrame 3:\n  Drone pose: [-56.69, 85.13, 20.67, -52.95, 68.01, 0.0]\n  Target bbox: [621.08, 326.58, 659.01, 392.23]\n\nFrame 4:\n  Drone pose: [-56.54, 85.42, 20.64, -52.3, 67.07, 0.0]\n  Target bbox: [619.04, 322.15, 661.2, 396.73]\n\nFrame 5 (current):\n  Drone pose: [-56.4, 85.7, 20.62, -51.65, 66.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.99, \"ymin\": 320.7, \"xmax\": 663.29, \"ymax\": 398.2}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": 0.29, \"dz\": -0.03, \"dpitch\": 0.64, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 0.57, \"dz\": -0.05, \"dpitch\": 1.28, \"dyaw\": -1.68, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": 0.86, \"dz\": -0.07, \"dpitch\": 1.91, \"dyaw\": -2.47, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 1.14, \"dz\": -0.09, \"dpitch\": 2.54, \"dyaw\": -3.23, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": 1.43, \"dz\": -0.2, \"dpitch\": 3.28, \"dyaw\": -3.95, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.84, "window_alt_abs_m": 1.38, "target_px_mean_hist": 601.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.53, 87.41, 20.39, -47.74, 61.52, 0.0]\n  Target bbox: [621.98, 327.36, 658.17, 391.71]\n\nFrame 2:\n  Drone pose: [-55.39, 87.7, 20.36, -47.12, 60.86, 0.0]\n  Target bbox: [624.96, 326.2, 655.25, 392.93]\n\nFrame 3:\n  Drone pose: [-55.24, 87.98, 20.33, -46.51, 60.21, 0.0]\n  Target bbox: [622.55, 327.62, 657.62, 391.53]\n\nFrame 4:\n  Drone pose: [-55.1, 88.27, 20.3, -45.9, 59.6, 0.0]\n  Target bbox: [621.38, 324.28, 658.93, 395.02]\n\nFrame 5 (current):\n  Drone pose: [-54.96, 88.55, 20.27, -45.31, 59.01, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.27, \"ymin\": 327.55, \"xmax\": 657.92, \"ymax\": 391.65}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": 0.28, \"dz\": -0.03, \"dpitch\": 0.59, \"dyaw\": -0.58, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 0.57, \"dz\": -0.05, \"dpitch\": 1.17, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": 0.44, \"dy\": 0.85, \"dz\": -0.08, \"dpitch\": 1.74, \"dyaw\": -1.65, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 1.14, \"dz\": -0.1, \"dpitch\": 2.29, \"dyaw\": -2.16, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": 1.42, \"dz\": -0.12, \"dpitch\": 2.27, \"dyaw\": -3.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.52, "window_alt_abs_m": 0.12, "target_px_mean_hist": 589.8, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-54.09, 90.26, 20.14, -43.06, 54.31, 0.0]\n  Target bbox: [623.69, 327.31, 656.26, 392.11]\n\nFrame 2:\n  Drone pose: [-53.85, 90.5, 20.12, -43.09, 53.33, 0.0]\n  Target bbox: [615.79, 322.27, 664.31, 397.34]\n\nFrame 3:\n  Drone pose: [-53.6, 90.74, 20.1, -43.12, 52.37, 0.0]\n  Target bbox: [617.98, 323.74, 662.1, 395.81]\n\nFrame 4:\n  Drone pose: [-53.34, 90.99, 20.09, -43.17, 51.41, 0.0]\n  Target bbox: [616.02, 322.06, 664.1, 397.55]\n\nFrame 5 (current):\n  Drone pose: [-53.06, 91.27, 20.08, -43.27, 50.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.62, \"ymin\": 324.29, \"xmax\": 660.43, \"ymax\": 395.2}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.34, \"dz\": -0.01, \"dpitch\": -0.17, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 0.73, \"dz\": -0.02, \"dpitch\": 0.12, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": 1.15, \"dz\": -0.03, \"dpitch\": 0.35, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": 1.36, \"dy\": 1.58, \"dz\": -0.04, \"dpitch\": 0.55, \"dyaw\": -1.5, \"droll\": 0.0}, {\"dx\": 1.73, \"dy\": 2.0, \"dz\": -0.04, \"dpitch\": 0.76, \"dyaw\": -1.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.9, "window_alt_abs_m": 0.06, "target_px_mean_hist": 535.5, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-50.95, 93.67, 20.03, -42.07, 48.74, 0.0]\n  Target bbox: [622.12, 325.79, 657.88, 393.7]\n\nFrame 2:\n  Drone pose: [-50.57, 94.06, 20.03, -42.34, 47.78, 0.0]\n  Target bbox: [621.33, 325.43, 658.71, 394.08]\n\nFrame 3:\n  Drone pose: [-50.17, 94.47, 20.02, -42.65, 46.8, 0.0]\n  Target bbox: [625.02, 326.93, 655.33, 392.44]\n\nFrame 4:\n  Drone pose: [-49.75, 94.91, 20.02, -42.52, 46.76, 0.0]\n  Target bbox: [624.79, 328.23, 655.52, 391.1]\n\nFrame 5 (current):\n  Drone pose: [-49.3, 95.38, 20.02, -42.44, 46.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.05, \"ymin\": 327.28, \"xmax\": 651.19, \"ymax\": 392.07}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": 0.51, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 1.06, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": 1.64, \"dz\": -0.01, \"dpitch\": -0.05, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": 2.24, \"dz\": -0.01, \"dpitch\": -0.12, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 2.33, \"dy\": 2.84, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": -0.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.03, "window_alt_abs_m": 0.02, "target_px_mean_hist": 543.0, "cur_frame_id": 34, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.24, 99.34, 20.0, -42.45, 44.94, 0.0]\n  Target bbox: [623.35, 326.44, 656.66, 392.98]\n\nFrame 2:\n  Drone pose: [-45.95, 99.85, 20.0, -42.75, 45.49, 0.0]\n  Target bbox: [625.41, 328.5, 654.82, 390.79]\n\nFrame 3:\n  Drone pose: [-45.68, 100.36, 20.0, -42.54, 45.03, 0.0]\n  Target bbox: [623.81, 326.63, 656.18, 392.76]\n\nFrame 4:\n  Drone pose: [-45.4, 100.89, 20.0, -42.82, 43.53, 0.0]\n  Target bbox: [624.38, 329.89, 655.36, 389.37]\n\nFrame 5 (current):\n  Drone pose: [-45.11, 101.46, 20.0, -42.67, 42.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.16, \"ymin\": 328.58, \"xmax\": 653.59, \"ymax\": 390.69}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.59, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -1.71, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": 2.47, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": 3.06, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": -2.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.05, "window_alt_abs_m": 0.0, "target_px_mean_hist": 375.5, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-43.07, 105.06, 20.0, -42.23, 39.98, 0.0]\n  Target bbox: [624.95, 330.2, 654.84, 389.08]\n\nFrame 2:\n  Drone pose: [-42.72, 105.54, 20.0, -42.05, 39.77, 0.0]\n  Target bbox: [621.05, 324.49, 659.09, 395.07]\n\nFrame 3:\n  Drone pose: [-42.38, 105.94, 20.0, -42.23, 38.63, 0.0]\n  Target bbox: [619.34, 322.67, 660.85, 396.93]\n\nFrame 4:\n  Drone pose: [-42.05, 106.28, 20.0, -42.34, 37.6, 0.0]\n  Target bbox: [619.8, 323.24, 660.43, 396.4]\n\nFrame 5 (current):\n  Drone pose: [-41.72, 106.57, 20.0, -42.4, 36.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.65, \"ymin\": 324.56, \"xmax\": 658.5, \"ymax\": 394.95}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 0.48, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.56, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": 0.67, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": 1.41, \"dy\": 0.84, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -2.84, \"droll\": 0.0}, {\"dx\": 1.78, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -3.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.3, "window_alt_abs_m": 0.0, "target_px_mean_hist": 531.5, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.55, 107.71, 20.0, -42.34, 32.78, 0.0]\n  Target bbox: [622.86, 325.72, 657.3, 393.78]\n\nFrame 2:\n  Drone pose: [-39.16, 107.85, 20.0, -42.32, 32.28, 0.0]\n  Target bbox: [620.23, 322.75, 660.07, 396.91]\n\nFrame 3:\n  Drone pose: [-38.76, 108.0, 20.0, -42.31, 31.79, 0.0]\n  Target bbox: [623.07, 326.09, 657.07, 393.39]\n\nFrame 4:\n  Drone pose: [-38.35, 108.16, 20.0, -42.31, 31.3, 0.0]\n  Target bbox: [621.37, 323.5, 658.89, 396.09]\n\nFrame 5 (current):\n  Drone pose: [-37.93, 108.33, 20.0, -42.34, 30.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.5, \"ymin\": 325.75, \"xmax\": 657.22, \"ymax\": 393.7}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": 0.19, \"dz\": 0.0, \"dpitch\": 0.3, \"dyaw\": 0.62, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": 1.73, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 0.38, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": 1.19, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -0.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 544.0, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.86, 110.22, 20.0, -42.1, 29.26, 0.0]\n  Target bbox: [624.49, 329.43, 655.39, 389.9]\n\nFrame 2:\n  Drone pose: [-34.41, 110.62, 20.0, -41.98, 29.44, 0.0]\n  Target bbox: [626.43, 326.96, 653.35, 392.45]\n\nFrame 3:\n  Drone pose: [-33.95, 111.04, 20.0, -41.88, 29.58, 0.0]\n  Target bbox: [624.1, 329.2, 655.76, 390.16]\n\nFrame 4:\n  Drone pose: [-33.48, 111.46, 20.0, -41.79, 29.73, 0.0]\n  Target bbox: [621.79, 326.28, 657.94, 393.22]\n\nFrame 5 (current):\n  Drone pose: [-33.0, 111.85, 20.0, -41.69, 29.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.23, \"ymin\": 326.75, \"xmax\": 653.54, \"ymax\": 392.69}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": 0.35, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": 0.85, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 0.23, \"droll\": 0.0}, {\"dx\": 1.86, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 2.32, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -0.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.68, "window_alt_abs_m": 0.0, "target_px_mean_hist": 552.0, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.22, 113.08, 20.0, -41.59, 29.11, 0.0]\n  Target bbox: [621.97, 324.5, 658.25, 395.08]\n\nFrame 2:\n  Drone pose: [-29.75, 113.2, 20.0, -41.63, 28.78, 0.0]\n  Target bbox: [622.05, 324.26, 658.21, 395.37]\n\nFrame 3:\n  Drone pose: [-29.29, 113.36, 20.0, -41.69, 28.37, 0.0]\n  Target bbox: [619.76, 321.34, 660.61, 398.37]\n\nFrame 4:\n  Drone pose: [-28.82, 113.55, 20.0, -41.78, 27.88, 0.0]\n  Target bbox: [621.82, 326.74, 657.99, 392.68]\n\nFrame 5 (current):\n  Drone pose: [-28.32, 113.78, 20.0, -41.6, 28.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.66, \"ymin\": 321.84, \"xmax\": 660.74, \"ymax\": 397.97}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.25, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": 0.84, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": 1.18, \"dz\": 0.0, \"dpitch\": -0.33, \"dyaw\": -1.75, \"droll\": 0.0}, {\"dx\": 2.36, \"dy\": 1.55, \"dz\": 0.0, \"dpitch\": -0.18, \"dyaw\": -1.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.87, "window_alt_abs_m": 0.0, "target_px_mean_hist": 513.8, "cur_frame_id": 86, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/ORI/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-25.53, 115.7, 20.0, -41.91, 26.03, 0.0]\n  Target bbox: [623.88, 325.09, 655.87, 394.35]\n\nFrame 2:\n  Drone pose: [-25.1, 116.05, 20.0, -41.75, 26.34, 0.0]\n  Target bbox: [622.05, 325.68, 657.7, 393.83]\n\nFrame 3:\n  Drone pose: [-24.63, 116.44, 20.0, -41.64, 26.54, 0.0]\n  Target bbox: [625.27, 327.58, 654.54, 391.85]\n\nFrame 4:\n  Drone pose: [-24.16, 116.8, 20.0, -41.75, 26.86, 0.0]\n  Target bbox: [626.26, 327.36, 653.54, 392.08]\n\nFrame 5 (current):\n  Drone pose: [-23.69, 117.18, 20.0, -41.65, 27.11, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.59, \"ymin\": 325.14, \"xmax\": 656.14, \"ymax\": 394.36}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": 0.38, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.24, \"droll\": 0.0}, {\"dx\": 0.93, \"dy\": 0.75, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": 1.39, \"dy\": 1.11, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": 1.85, \"dy\": 1.48, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.34, \"droll\": 0.0}, {\"dx\": 2.32, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -1.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.07, "window_alt_abs_m": 0.0, "target_px_mean_hist": 539.8, "cur_frame_id": 96, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-56.94, 84.38, 21.95, -56.85, 72.31, 0.0]\n  Target bbox: [602.79, 304.55, 639.64, 372.06]\n\nFrame 2:\n  Drone pose: [-56.91, 84.98, 21.2, -54.46, 68.52, 0.0]\n  Target bbox: [620.03, 323.9, 660.14, 394.86]\n\nFrame 3:\n  Drone pose: [-56.69, 85.13, 20.67, -57.95, 63.01, 0.0]\n  Target bbox: [667.02, 238.04, 716.12, 316.06]\n\nFrame 4:\n  Drone pose: [-56.54, 85.42, 20.64, -52.3, 67.07, 0.0]\n  Target bbox: [620.7, 325.92, 659.42, 392.93]\n\nFrame 5 (current):\n  Drone pose: [-56.4, 85.7, 20.62, -51.65, 66.17, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.98, \"ymin\": 323.23, \"xmax\": 661.24, \"ymax\": 395.7}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": 0.29, \"dz\": -0.03, \"dpitch\": 0.64, \"dyaw\": -0.86, \"droll\": 0.0}, {\"dx\": 0.29, \"dy\": 0.57, \"dz\": -0.05, \"dpitch\": 1.28, \"dyaw\": -1.68, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": 0.86, \"dz\": -0.07, \"dpitch\": 1.91, \"dyaw\": -2.47, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": 1.14, \"dz\": -0.09, \"dpitch\": 2.54, \"dyaw\": -3.23, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": 1.43, \"dz\": -0.2, \"dpitch\": 3.28, \"dyaw\": -3.95, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.26, "window_alt_abs_m": 1.33, "target_px_mean_hist": 597.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-55.62, 87.41, 20.55, -44.97, 58.57, 0.0]\n  Target bbox: [651.73, 375.99, 690.46, 442.77]\n\nFrame 2:\n  Drone pose: [-55.34, 87.53, 20.4, -46.99, 61.25, 0.0]\n  Target bbox: [619.04, 322.47, 661.29, 396.73]\n\nFrame 3:\n  Drone pose: [-55.26, 87.85, 20.28, -46.25, 60.37, 0.0]\n  Target bbox: [623.69, 328.59, 656.45, 390.53]\n\nFrame 4:\n  Drone pose: [-55.2, 88.14, 20.38, -47.38, 63.67, 0.0]\n  Target bbox: [573.08, 297.91, 609.83, 370.2]\n\nFrame 5 (current):\n  Drone pose: [-54.93, 88.58, 20.4, -45.55, 59.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.34, \"ymin\": 327.72, \"xmax\": 657.84, \"ymax\": 391.48}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": 0.25, \"dz\": -0.16, \"dpitch\": 0.83, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 0.54, \"dz\": -0.18, \"dpitch\": 1.41, \"dyaw\": -1.16, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 0.82, \"dz\": -0.21, \"dpitch\": 1.98, \"dyaw\": -1.68, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": 1.11, \"dz\": -0.23, \"dpitch\": 2.53, \"dyaw\": -2.19, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": 1.39, \"dz\": -0.25, \"dpitch\": 2.51, \"dyaw\": -3.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.48, "window_alt_abs_m": 0.38, "target_px_mean_hist": 585.5, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-53.98, 90.36, 20.12, -43.24, 54.39, 0.0]\n  Target bbox: [616.14, 321.89, 663.91, 397.64]\n\nFrame 2:\n  Drone pose: [-53.85, 90.5, 20.12, -38.8, 54.43, 0.0]\n  Target bbox: [606.0, 396.98, 646.43, 467.0]\n\nFrame 3:\n  Drone pose: [-53.6, 90.74, 20.1, -43.12, 52.37, 0.0]\n  Target bbox: [622.35, 326.32, 657.62, 393.12]\n\nFrame 4:\n  Drone pose: [-53.34, 90.99, 20.09, -43.17, 51.41, 0.0]\n  Target bbox: [623.95, 326.77, 655.99, 392.59]\n\nFrame 5 (current):\n  Drone pose: [-53.03, 91.23, 20.2, -41.92, 45.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 680.68, \"ymin\": 352.01, \"xmax\": 720.63, \"ymax\": 421.86}, \"waypoint_deltas\": [{\"dx\": 0.27, \"dy\": 0.38, \"dz\": -0.13, \"dpitch\": -1.52, \"dyaw\": 3.74, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": 0.77, \"dz\": -0.14, \"dpitch\": -1.23, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 1.19, \"dz\": -0.15, \"dpitch\": -1.0, \"dyaw\": 3.43, \"droll\": 0.0}, {\"dx\": 1.33, \"dy\": 1.62, \"dz\": -0.16, \"dpitch\": -0.8, \"dyaw\": 3.29, \"droll\": 0.0}, {\"dx\": 1.7, \"dy\": 2.04, \"dz\": -0.16, \"dpitch\": -0.59, \"dyaw\": 3.18, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.84, "window_alt_abs_m": 0.14, "target_px_mean_hist": 549.0, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-50.95, 93.67, 20.03, -42.07, 48.74, 0.0]\n  Target bbox: [617.53, 322.24, 662.59, 397.37]\n\nFrame 2:\n  Drone pose: [-50.66, 94.0, 19.9, -40.41, 50.01, 0.0]\n  Target bbox: [586.97, 348.77, 634.9, 425.49]\n\nFrame 3:\n  Drone pose: [-50.3, 94.51, 19.96, -42.48, 46.47, 0.0]\n  Target bbox: [624.81, 326.42, 655.55, 392.95]\n\nFrame 4:\n  Drone pose: [-49.73, 94.84, 19.9, -43.67, 49.01, 0.0]\n  Target bbox: [600.8, 305.46, 627.28, 368.1]\n\nFrame 5 (current):\n  Drone pose: [-49.23, 95.54, 20.04, -42.7, 46.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.62, \"ymin\": 330.91, \"xmax\": 654.58, \"ymax\": 388.3}, \"waypoint_deltas\": [{\"dx\": 0.39, \"dy\": 0.35, \"dz\": -0.03, \"dpitch\": 0.29, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": 0.9, \"dz\": -0.03, \"dpitch\": 0.27, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.35, \"dy\": 1.48, \"dz\": -0.03, \"dpitch\": 0.21, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 1.82, \"dy\": 2.08, \"dz\": -0.03, \"dpitch\": 0.14, \"dyaw\": -0.51, \"droll\": 0.0}, {\"dx\": 2.26, \"dy\": 2.68, \"dz\": -0.03, \"dpitch\": 0.1, \"dyaw\": -0.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.82, "window_alt_abs_m": 0.4, "target_px_mean_hist": 557.8, "cur_frame_id": 34, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-46.11, 99.39, 20.12, -46.01, 50.1, 0.0]\n  Target bbox: [560.45, 274.65, 595.52, 340.44]\n\nFrame 2:\n  Drone pose: [-45.86, 99.89, 19.87, -42.68, 45.6, 0.0]\n  Target bbox: [627.79, 328.3, 652.45, 390.96]\n\nFrame 3:\n  Drone pose: [-45.68, 100.29, 20.18, -40.96, 41.54, 0.0]\n  Target bbox: [663.31, 353.3, 706.65, 427.36]\n\nFrame 4:\n  Drone pose: [-45.27, 101.02, 19.94, -42.99, 43.52, 0.0]\n  Target bbox: [623.36, 326.41, 656.27, 392.9]\n\nFrame 5 (current):\n  Drone pose: [-45.11, 101.46, 20.0, -42.67, 42.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.87, \"ymin\": 328.17, \"xmax\": 653.86, \"ymax\": 391.11}, \"waypoint_deltas\": [{\"dx\": 0.3, \"dy\": 0.59, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 0.63, \"dy\": 1.21, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -1.14, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": -1.71, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": 2.47, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": -2.23, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": 3.06, \"dz\": 0.0, \"dpitch\": 0.33, \"dyaw\": -2.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.05, "window_alt_abs_m": 0.85, "target_px_mean_hist": 386.5, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-43.15, 105.07, 20.17, -42.41, 39.83, 0.0]\n  Target bbox: [623.81, 328.79, 655.94, 390.51]\n\nFrame 2:\n  Drone pose: [-42.66, 105.42, 19.99, -42.0, 40.11, 0.0]\n  Target bbox: [621.14, 324.63, 658.97, 394.92]\n\nFrame 3:\n  Drone pose: [-42.38, 105.94, 20.0, -41.95, 36.49, 0.0]\n  Target bbox: [644.44, 326.2, 689.8, 403.49]\n\nFrame 4:\n  Drone pose: [-42.05, 106.28, 20.0, -43.31, 42.6, 0.0]\n  Target bbox: [558.93, 310.99, 596.34, 379.4]\n\nFrame 5 (current):\n  Drone pose: [-41.53, 106.55, 20.04, -42.63, 37.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.45, \"ymin\": 325.76, \"xmax\": 657.66, \"ymax\": 393.72}, \"waypoint_deltas\": [{\"dx\": 0.15, \"dy\": 0.28, \"dz\": -0.04, \"dpitch\": 0.21, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 0.5, \"dz\": -0.04, \"dpitch\": 0.2, \"dyaw\": -1.93, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 0.69, \"dz\": -0.04, \"dpitch\": 0.21, \"dyaw\": -2.6, \"droll\": 0.0}, {\"dx\": 1.22, \"dy\": 0.86, \"dz\": -0.04, \"dpitch\": 0.24, \"dyaw\": -3.21, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": 1.02, \"dz\": -0.04, \"dpitch\": 0.26, \"dyaw\": -3.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.56, "window_alt_abs_m": 0.22, "target_px_mean_hist": 556.0, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.47, 107.83, 20.11, -44.62, 27.61, 0.0]\n  Target bbox: [685.63, 297.03, 718.52, 360.94]\n\nFrame 2:\n  Drone pose: [-39.16, 107.85, 20.0, -42.32, 32.28, 0.0]\n  Target bbox: [620.81, 323.32, 659.42, 396.24]\n\nFrame 3:\n  Drone pose: [-38.59, 108.06, 19.96, -43.16, 34.85, 0.0]\n  Target bbox: [581.92, 311.71, 624.59, 386.88]\n\nFrame 4:\n  Drone pose: [-38.35, 108.16, 20.0, -42.31, 31.3, 0.0]\n  Target bbox: [619.65, 322.39, 660.69, 397.33]\n\nFrame 5 (current):\n  Drone pose: [-37.93, 108.33, 20.0, -43.89, 35.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 558.93, \"ymin\": 302.51, \"xmax\": 595.97, \"ymax\": 368.6}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": 0.19, \"dz\": 0.0, \"dpitch\": 1.85, \"dyaw\": -4.38, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 1.78, \"dyaw\": -4.96, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": 0.63, \"dz\": 0.0, \"dpitch\": 1.69, \"dyaw\": -5.6, \"droll\": 0.0}, {\"dx\": 1.73, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 1.93, \"dyaw\": -5.13, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": 1.19, \"dz\": 0.0, \"dpitch\": 1.79, \"dyaw\": -5.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.27, "window_alt_abs_m": 0.19, "target_px_mean_hist": 566.0, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.86, 110.22, 20.0, -42.1, 29.26, 0.0]\n  Target bbox: [620.55, 324.75, 659.15, 394.75]\n\nFrame 2:\n  Drone pose: [-34.41, 110.62, 20.0, -46.28, 25.79, 0.0]\n  Target bbox: [666.51, 253.06, 705.28, 323.84]\n\nFrame 3:\n  Drone pose: [-33.83, 110.98, 20.12, -38.29, 34.87, 0.0]\n  Target bbox: [560.7, 391.93, 593.68, 460.77]\n\nFrame 4:\n  Drone pose: [-33.48, 111.55, 20.16, -41.31, 34.44, 0.0]\n  Target bbox: [560.61, 343.31, 595.52, 405.61]\n\nFrame 5 (current):\n  Drone pose: [-32.94, 111.86, 19.85, -37.08, 28.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 645.84, \"ymin\": 399.94, \"xmax\": 683.14, \"ymax\": 470.41}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": 0.34, \"dz\": 0.15, \"dpitch\": -4.48, \"dyaw\": 2.19, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.62, \"dz\": 0.15, \"dpitch\": -4.3, \"dyaw\": 2.65, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": 0.84, \"dz\": 0.15, \"dpitch\": -4.4, \"dyaw\": 2.1, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": 0.99, \"dz\": 0.15, \"dpitch\": -4.45, \"dyaw\": 1.67, \"droll\": 0.0}, {\"dx\": 2.26, \"dy\": 1.11, \"dz\": 0.15, \"dpitch\": -4.49, \"dyaw\": 1.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.36, "window_alt_abs_m": 0.46, "target_px_mean_hist": 544.8, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00086/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.17, 113.01, 19.95, -40.26, 29.64, 0.0]\n  Target bbox: [617.02, 344.29, 655.52, 418.19]\n\nFrame 2:\n  Drone pose: [-29.79, 113.1, 19.91, -41.97, 29.78, 0.0]\n  Target bbox: [609.31, 312.35, 649.84, 387.79]\n\nFrame 3:\n  Drone pose: [-29.44, 113.4, 20.03, -40.32, 23.07, 0.0]\n  Target bbox: [684.84, 347.23, 721.7, 418.33]\n\nFrame 4:\n  Drone pose: [-28.72, 113.41, 20.08, -41.93, 28.33, 0.0]\n  Target bbox: [620.96, 325.51, 658.75, 394.04]\n\nFrame 5 (current):\n  Drone pose: [-28.2, 113.83, 20.14, -41.99, 28.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.08, \"ymin\": 325.56, \"xmax\": 657.13, \"ymax\": 393.99}, \"waypoint_deltas\": [{\"dx\": 0.38, \"dy\": 0.2, \"dz\": -0.14, \"dpitch\": 0.23, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.48, \"dz\": -0.14, \"dpitch\": 0.38, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 1.35, \"dy\": 0.79, \"dz\": -0.14, \"dpitch\": 0.21, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": 1.13, \"dz\": -0.14, \"dpitch\": 0.06, \"dyaw\": -1.78, \"droll\": 0.0}, {\"dx\": 2.24, \"dy\": 1.5, \"dz\": -0.14, \"dpitch\": 0.21, \"dyaw\": -1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.33, "window_alt_abs_m": 0.28, "target_px_mean_hist": 525.5, "cur_frame_id": 86, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228/aug_001/frames_playback/frame_00096/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-25.7, 115.74, 20.04, -43.92, 25.46, 0.0]\n  Target bbox: [624.61, 289.22, 662.43, 359.19]\n\nFrame 2:\n  Drone pose: [-24.95, 116.0, 19.98, -38.35, 29.81, 0.0]\n  Target bbox: [582.88, 386.56, 616.31, 452.57]\n\nFrame 3:\n  Drone pose: [-24.63, 116.44, 20.0, -40.11, 27.57, 0.0]\n  Target bbox: [608.69, 351.15, 644.86, 419.91]\n\nFrame 4:\n  Drone pose: [-23.97, 116.8, 19.98, -41.94, 27.07, 0.0]\n  Target bbox: [625.91, 327.36, 653.89, 392.07]\n\nFrame 5 (current):\n  Drone pose: [-23.64, 117.11, 19.82, -45.62, 31.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.91, \"ymin\": 255.41, \"xmax\": 605.96, \"ymax\": 324.35}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": 0.45, \"dz\": 0.18, \"dpitch\": 4.08, \"dyaw\": -3.93, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": 0.82, \"dz\": 0.18, \"dpitch\": 4.2, \"dyaw\": -3.67, \"droll\": 0.0}, {\"dx\": 1.34, \"dy\": 1.18, \"dz\": 0.18, \"dpitch\": 4.04, \"dyaw\": -4.59, \"droll\": 0.0}, {\"dx\": 1.8, \"dy\": 1.55, \"dz\": 0.18, \"dpitch\": 3.87, \"dyaw\": -5.51, \"droll\": 0.0}, {\"dx\": 2.27, \"dy\": 1.91, \"dz\": 0.18, \"dpitch\": 3.99, \"dyaw\": -5.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.31, "window_alt_abs_m": 0.26, "target_px_mean_hist": 531.0, "cur_frame_id": 96, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776090228", "difficulty_score": 0.2572, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [77.45, 88.33, 22.0, -46.42, -63.43, 0.0]\n  Target bbox: [622.44, 324.82, 657.8, 394.55] (model-predicted box)\n\nFrame 2:\n  Drone pose: [80.65, 85.33, 21.2, -50.92, -69.53, 0.0]\n  Target bbox: [619.18, 321.4, 661.13, 397.63] (model-predicted box)\n\nFrame 3:\n  Drone pose: [84.83, 83.33, 20.67, -54.63, -83.56, 0.0]\n  Target bbox: [618.97, 321.45, 661.47, 397.19]\n\nFrame 4:\n  Drone pose: [89.48, 81.97, 20.65, -55.76, -103.32, 0.0]\n  Target bbox: [616.76, 318.87, 663.42, 399.6]\n\nFrame 5 (current):\n  Drone pose: [93.85, 81.04, 20.62, -53.77, -118.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.47, \"ymin\": 322.5, \"xmax\": 660.79, \"ymax\": 396.15}, \"waypoint_deltas\": [{\"dx\": 3.4, \"dy\": -0.75, \"dz\": -0.03, \"dpitch\": 2.63, \"dyaw\": -9.69, \"droll\": 0.0}, {\"dx\": 5.6, \"dy\": -1.44, \"dz\": -0.05, \"dpitch\": 4.35, \"dyaw\": -14.65, \"droll\": 0.0}, {\"dx\": 6.88, \"dy\": -2.1, \"dz\": -0.07, \"dpitch\": 5.7, \"dyaw\": -18.03, \"droll\": 0.0}, {\"dx\": 7.68, \"dy\": -2.7, \"dz\": -0.09, \"dpitch\": 5.98, \"dyaw\": -18.92, \"droll\": 0.0}, {\"dx\": 8.3, \"dy\": -3.25, \"dz\": -0.2, \"dpitch\": 6.23, \"dyaw\": -19.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 2, "current_invisible": false, "window_yaw_abs_deg": 55.48, "window_alt_abs_m": 1.38, "target_px_mean_hist": 143.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, 
"seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.53, 73.65, 20.19, -47.5, -138.29, 0.0]\n  Target bbox: [616.93, 319.61, 663.02, 399.49]\n\nFrame 2:\n  Drone pose: [104.52, 73.12, 20.17, -47.53, -138.34, 0.0]\n  Target bbox: [615.61, 318.93, 664.41, 400.28]\n\nFrame 3:\n  Drone pose: [104.5, 72.6, 20.15, -47.54, -138.36, 0.0]\n  Target bbox: [618.96, 321.5, 660.92, 397.54]\n\nFrame 4:\n  Drone pose: [104.49, 72.09, 20.14, -47.53, -138.37, 0.0]\n  Target bbox: [617.42, 320.84, 662.51, 398.32]\n\nFrame 5 (current):\n  Drone pose: [104.51, 71.6, 20.12, -47.25, -138.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.27, \"ymin\": 319.7, \"xmax\": 663.71, \"ymax\": 399.51}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.48, \"dz\": -0.01, \"dpitch\": 0.07, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -0.95, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.42, \"dz\": -0.04, \"dpitch\": 0.24, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.89, \"dz\": -0.05, \"dpitch\": 0.33, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -2.36, \"dz\": -0.06, \"dpitch\": 0.4, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.07, "window_alt_abs_m": 0.07, "target_px_mean_hist": 639.0, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.67, 64.8, 20.01, -46.68, -138.26, 0.0]\n  Target bbox: [618.82, 321.34, 661.06, 397.75]\n\nFrame 2:\n  Drone pose: [104.71, 64.27, 20.01, -46.66, -138.4, 0.0]\n  Target bbox: [616.18, 319.82, 663.8, 399.45]\n\nFrame 3:\n  Drone pose: [104.71, 63.78, 20.01, -46.64, -138.36, 0.0]\n  Target bbox: [616.57, 319.79, 663.4, 399.44]\n\nFrame 4:\n  Drone pose: [104.69, 63.3, 20.01, -46.64, -138.29, 0.0]\n  Target bbox: [616.16, 319.17, 663.83, 400.05]\n\nFrame 5 (current):\n  Drone pose: [104.6, 62.85, 20.01, -46.69, -137.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.1, \"ymin\": 321.05, \"xmax\": 660.77, \"ymax\": 397.99}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": -0.4, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": -0.62, \"dz\": -0.01, \"dpitch\": -0.69, \"dyaw\": 3.0, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": -0.74, \"dz\": -0.01, \"dpitch\": -1.59, \"dyaw\": 6.81, \"droll\": 0.0}, {\"dx\": -3.62, \"dy\": -0.74, \"dz\": -0.01, \"dpitch\": -2.51, \"dyaw\": 11.89, \"droll\": 0.0}, {\"dx\": -5.27, \"dy\": -0.67, \"dz\": -0.01, \"dpitch\": -3.24, \"dyaw\": 17.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.58, "window_alt_abs_m": 0.01, "target_px_mean_hist": 647.2, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.5, 60.45, 20.0, -48.81, -90.37, 0.0]\n  Target bbox: [624.12, 322.58, 655.63, 396.34]\n\nFrame 2:\n  Drone pose: [91.46, 59.93, 20.0, -48.85, -90.24, 0.0]\n  Target bbox: [621.97, 321.82, 657.87, 397.0]\n\nFrame 3:\n  Drone pose: [91.42, 59.4, 20.0, -48.9, -90.1, 0.0]\n  Target bbox: [622.75, 322.72, 657.18, 396.18]\n\nFrame 4:\n  Drone pose: [91.38, 58.88, 20.0, -48.94, -89.97, 0.0]\n  Target bbox: [628.54, 321.74, 651.48, 397.08]\n\nFrame 5 (current):\n  Drone pose: [91.32, 58.35, 20.0, -48.98, -89.76, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.42, \"ymin\": 321.53, \"xmax\": 652.74, \"ymax\": 397.28}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": -1.34, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": -1.77, \"dz\": 0.0, \"dpitch\": 0.39, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.43, \"dyaw\": 0.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.61, "window_alt_abs_m": 0.0, "target_px_mean_hist": 752.5, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.26, 51.68, 20.0, -48.42, -89.56, 0.0]\n  Target bbox: [626.31, 322.31, 653.97, 396.57]\n\nFrame 2:\n  Drone pose: [91.26, 51.25, 20.0, -48.52, -89.56, 0.0]\n  Target bbox: [625.42, 322.58, 654.87, 396.31]\n\nFrame 3:\n  Drone pose: [91.27, 50.83, 20.0, -48.4, -89.59, 0.0]\n  Target bbox: [620.34, 322.69, 659.92, 396.2]\n\nFrame 4:\n  Drone pose: [91.26, 50.4, 20.0, -48.28, -89.55, 0.0]\n  Target bbox: [628.55, 321.78, 651.74, 397.06]\n\nFrame 5 (current):\n  Drone pose: [91.29, 49.97, 20.0, -48.16, -89.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.58, \"ymin\": 322.87, \"xmax\": 657.63, \"ymax\": 396.05}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.05, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": -0.03, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.2, "window_alt_abs_m": 0.0, "target_px_mean_hist": 727.8, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [90.61, 41.05, 20.0, -51.07, -79.57, 0.0]\n  Target bbox: [618.71, 320.65, 661.2, 398.09]\n\nFrame 2:\n  Drone pose: [90.65, 40.42, 20.0, -51.13, -77.8, 0.0]\n  Target bbox: [619.02, 319.54, 660.8, 399.14]\n\nFrame 3:\n  Drone pose: [90.61, 39.56, 20.0, -51.55, -75.49, 0.0]\n  Target bbox: [616.4, 319.08, 663.45, 399.68]\n\nFrame 4:\n  Drone pose: [90.59, 38.7, 20.0, -51.93, -73.21, 0.0]\n  Target bbox: [619.12, 320.25, 660.71, 398.36]\n\nFrame 5 (current):\n  Drone pose: [90.05, 37.76, 20.0, -52.08, -68.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.39, \"ymin\": 321.07, \"xmax\": 660.41, \"ymax\": 397.55}, \"waypoint_deltas\": [{\"dx\": -0.7, \"dy\": -0.78, \"dz\": 0.0, \"dpitch\": 0.41, \"dyaw\": 4.68, \"droll\": 0.0}, {\"dx\": -1.24, \"dy\": -1.48, \"dz\": 0.0, \"dpitch\": 0.97, \"dyaw\": 8.51, \"droll\": 0.0}, {\"dx\": -1.39, \"dy\": -1.97, \"dz\": 0.0, \"dpitch\": 1.59, \"dyaw\": 10.54, \"droll\": 0.0}, {\"dx\": -1.4, \"dy\": -2.42, \"dz\": 0.0, \"dpitch\": 2.15, \"dyaw\": 12.02, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -2.88, \"dz\": 0.0, \"dpitch\": 2.72, \"dyaw\": 13.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.69, "window_alt_abs_m": 0.0, "target_px_mean_hist": 743.2, "cur_frame_id": 93, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.13, 31.62, 20.0, -46.89, -52.81, 0.0]\n  Target bbox: [622.75, 323.01, 656.92, 395.96]\n\nFrame 2:\n  Drone pose: [91.67, 31.18, 20.0, -46.85, -53.02, 0.0]\n  Target bbox: [622.68, 324.94, 657.07, 393.92]\n\nFrame 3:\n  Drone pose: [92.22, 30.74, 20.0, -46.83, -53.26, 0.0]\n  Target bbox: [624.16, 323.85, 655.56, 395.08]\n\nFrame 4:\n  Drone pose: [92.77, 30.3, 20.0, -46.8, -53.5, 0.0]\n  Target bbox: [621.29, 323.8, 658.37, 395.24]\n\nFrame 5 (current):\n  Drone pose: [93.29, 29.86, 20.0, -46.74, -53.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.61, \"ymin\": 322.73, \"xmax\": 659.02, \"ymax\": 396.3}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": -0.85, \"dz\": 0.0, \"dpitch\": 0.28, \"dyaw\": -0.04, \"droll\": 0.0}, {\"dx\": 1.27, \"dy\": -1.26, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.15, \"droll\": 0.0}, {\"dx\": 1.6, \"dy\": -1.67, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -2.17, \"droll\": 0.0}, {\"dx\": 1.88, \"dy\": -2.07, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": -3.12, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.86, "window_alt_abs_m": 0.0, "target_px_mean_hist": 691.5, "cur_frame_id": 110, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.32, 24.19, 20.0, -47.42, -64.32, 0.0]\n  Target bbox: [621.21, 320.05, 659.1, 399.01]\n\nFrame 2:\n  Drone pose: [97.54, 23.77, 20.0, -47.46, -65.09, 0.0]\n  Target bbox: [615.96, 318.96, 664.33, 400.22]\n\nFrame 3:\n  Drone pose: [97.76, 23.35, 20.0, -47.49, -65.83, 0.0]\n  Target bbox: [620.88, 320.73, 659.43, 398.42]\n\nFrame 4:\n  Drone pose: [97.96, 22.92, 20.0, -47.51, -66.55, 0.0]\n  Target bbox: [617.24, 319.86, 663.08, 399.35]\n\nFrame 5 (current):\n  Drone pose: [98.16, 22.49, 20.0, -47.53, -67.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.91, \"ymin\": 320.6, \"xmax\": 657.4, \"ymax\": 398.44}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -0.86, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.31, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": -1.3, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -1.92, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": -1.74, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -2.52, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -2.18, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -3.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.92, "window_alt_abs_m": 0.0, "target_px_mean_hist": 706.5, "cur_frame_id": 128, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.25, 16.22, 20.0, -47.52, -74.53, 0.0]\n  Target bbox: [618.74, 320.91, 661.66, 398.26]\n\nFrame 2:\n  Drone pose: [100.36, 15.76, 20.0, -47.51, -74.91, 0.0]\n  Target bbox: [622.16, 320.69, 658.19, 398.35]\n\nFrame 3:\n  Drone pose: [100.46, 15.3, 20.0, -47.5, -75.27, 0.0]\n  Target bbox: [622.89, 320.36, 657.47, 398.65]\n\nFrame 4:\n  Drone pose: [100.56, 14.83, 20.0, -47.49, -75.62, 0.0]\n  Target bbox: [621.23, 321.01, 659.14, 398.08]\n\nFrame 5 (current):\n  Drone pose: [100.66, 14.36, 20.0, -47.47, -75.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.64, \"ymin\": 320.41, \"xmax\": 657.73, \"ymax\": 398.6}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -0.94, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.27, \"dy\": -1.41, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.92, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.88, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": -2.35, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -1.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.42, "window_alt_abs_m": 0.0, "target_px_mean_hist": 723.0, "cur_frame_id": 146, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/ORI/frames_playback/frame_00164/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.67, 7.7, 20.0, -47.29, -79.41, 0.0]\n  Target bbox: [623.83, 320.89, 656.53, 398.11]\n\nFrame 2:\n  Drone pose: [101.72, 7.22, 20.0, -47.27, -79.59, 0.0]\n  Target bbox: [622.32, 320.84, 658.07, 398.18]\n\nFrame 3:\n  Drone pose: [101.77, 6.73, 20.0, -47.26, -79.76, 0.0]\n  Target bbox: [620.76, 321.37, 659.64, 397.69]\n\nFrame 4:\n  Drone pose: [101.82, 6.25, 20.0, -47.25, -79.93, 0.0]\n  Target bbox: [621.55, 321.87, 658.85, 397.26]\n\nFrame 5 (current):\n  Drone pose: [101.86, 5.77, 20.0, -47.24, -80.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.31, \"ymin\": 321.75, \"xmax\": 658.07, \"ymax\": 397.33}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.44, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 726.0, "cur_frame_id": 164, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [77.45, 88.33, 22.0, -46.42, -63.43, 0.0]\n  Target bbox: [619.43, 323.59, 660.79, 395.79] (model-predicted box)\n\nFrame 2:\n  Drone pose: [80.73, 85.38, 21.11, -50.76, -69.84, 0.0]\n  Target bbox: [620.24, 321.24, 660.08, 397.72] (model-predicted box)\n\nFrame 3:\n  Drone pose: [84.83, 83.33, 20.67, -52.83, -81.28, 0.0]\n  Target bbox: [602.84, 350.89, 632.34, 428.65]\n\nFrame 4:\n  Drone pose: [89.48, 81.97, 20.65, -60.57, -108.32, 0.0]\n  Target bbox: [663.57, 240.64, 712.85, 319.39]\n\nFrame 5 (current):\n  Drone pose: [93.85, 81.04, 20.62, -48.77, -123.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 670.1, \"ymin\": 410.4, \"xmax\": 710.78, \"ymax\": 479.84}, \"waypoint_deltas\": [{\"dx\": 3.4, \"dy\": -0.75, \"dz\": -0.03, \"dpitch\": -2.37, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": 5.6, \"dy\": -1.44, \"dz\": -0.05, \"dpitch\": -0.65, \"dyaw\": -9.65, \"droll\": 0.0}, {\"dx\": 6.88, \"dy\": -2.1, \"dz\": -0.07, \"dpitch\": 0.7, \"dyaw\": -13.03, \"droll\": 0.0}, {\"dx\": 7.68, \"dy\": -2.7, \"dz\": -0.09, \"dpitch\": 0.98, \"dyaw\": -13.92, \"droll\": 0.0}, {\"dx\": 8.3, \"dy\": -3.25, \"dz\": -0.2, \"dpitch\": 1.23, \"dyaw\": -14.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 2, "current_invisible": false, "window_yaw_abs_deg": 60.48, "window_alt_abs_m": 1.38, "target_px_mean_hist": 138.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": 
false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.53, 73.65, 20.19, -48.55, -143.29, 0.0]\n  Target bbox: [672.5, 304.42, 722.13, 383.29]\n\nFrame 2:\n  Drone pose: [104.61, 73.28, 20.16, -51.51, -140.49, 0.0]\n  Target bbox: [645.76, 249.28, 688.69, 326.64]\n\nFrame 3:\n  Drone pose: [104.4, 72.48, 20.18, -44.32, -136.46, 0.0]\n  Target bbox: [592.95, 377.74, 641.39, 459.87]\n\nFrame 4:\n  Drone pose: [104.49, 72.09, 20.14, -48.35, -133.37, 0.0]\n  Target bbox: [555.88, 305.8, 609.48, 389.61]\n\nFrame 5 (current):\n  Drone pose: [104.51, 71.6, 20.12, -47.25, -138.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.42, \"ymin\": 318.52, \"xmax\": 665.64, \"ymax\": 400.82}, \"waypoint_deltas\": [{\"dx\": 0.02, \"dy\": -0.48, \"dz\": -0.01, \"dpitch\": 0.07, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -0.95, \"dz\": -0.03, \"dpitch\": 0.15, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.42, \"dz\": -0.04, \"dpitch\": 0.24, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.89, \"dz\": -0.05, \"dpitch\": 0.33, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -2.36, \"dz\": -0.06, \"dpitch\": 0.4, \"dyaw\": 0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.93, "window_alt_abs_m": 0.12, "target_px_mean_hist": 661.2, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.67, 64.8, 20.01, -46.68, -138.26, 0.0]\n  Target bbox: [618.8, 321.47, 661.1, 397.65]\n\nFrame 2:\n  Drone pose: [104.59, 64.39, 19.94, -45.9, -132.85, 0.0]\n  Target bbox: [557.43, 332.51, 605.96, 412.81]\n\nFrame 3:\n  Drone pose: [104.69, 63.75, 20.15, -50.02, -135.61, 0.0]\n  Target bbox: [586.07, 268.82, 628.8, 346.9]\n\nFrame 4:\n  Drone pose: [104.83, 63.18, 20.04, -45.52, -138.51, 0.0]\n  Target bbox: [610.88, 336.97, 660.68, 419.92]\n\nFrame 5 (current):\n  Drone pose: [104.6, 62.85, 20.01, -46.69, -137.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.8, \"ymin\": 319.98, \"xmax\": 662.11, \"ymax\": 399.1}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": -0.4, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": -0.62, \"dz\": -0.01, \"dpitch\": -0.69, \"dyaw\": 3.0, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": -0.74, \"dz\": -0.01, \"dpitch\": -1.59, \"dyaw\": 6.81, \"droll\": 0.0}, {\"dx\": -3.62, \"dy\": -0.74, \"dz\": -0.01, \"dpitch\": -2.51, \"dyaw\": 11.89, \"droll\": 0.0}, {\"dx\": -5.27, \"dy\": -0.67, \"dz\": -0.01, \"dpitch\": -3.24, \"dyaw\": 17.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.61, "window_alt_abs_m": 0.42, "target_px_mean_hist": 651.8, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.5, 60.45, 20.0, -50.35, -95.37, 0.0]\n  Target bbox: [675.04, 297.68, 716.06, 373.4]\n\nFrame 2:\n  Drone pose: [91.45, 59.92, 20.07, -48.96, -90.21, 0.0]\n  Target bbox: [628.64, 321.61, 651.22, 397.19]\n\nFrame 3:\n  Drone pose: [91.42, 59.4, 20.0, -45.88, -90.5, 0.0]\n  Target bbox: [625.57, 372.52, 663.22, 447.84]\n\nFrame 4:\n  Drone pose: [91.38, 58.88, 20.0, -47.24, -91.85, 0.0]\n  Target bbox: [642.65, 351.17, 679.29, 425.29]\n\nFrame 5 (current):\n  Drone pose: [91.41, 58.29, 20.03, -45.59, -86.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 592.66, \"ymin\": 381.25, \"xmax\": 619.19, \"ymax\": 457.76}, \"waypoint_deltas\": [{\"dx\": -0.14, \"dy\": -0.44, \"dz\": -0.03, \"dpitch\": -3.39, \"dyaw\": -2.6, \"droll\": 0.0}, {\"dx\": -0.16, \"dy\": -0.88, \"dz\": -0.03, \"dpitch\": -3.29, \"dyaw\": -2.51, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -1.28, \"dz\": -0.03, \"dpitch\": -3.11, \"dyaw\": -2.5, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -1.71, \"dz\": -0.03, \"dpitch\": -3.0, \"dyaw\": -2.49, \"droll\": 0.0}, {\"dx\": -0.17, \"dy\": -2.19, \"dz\": -0.03, \"dpitch\": -2.96, \"dyaw\": -2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.67, "window_alt_abs_m": 0.16, "target_px_mean_hist": 749.8, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.38, 51.62, 19.86, -47.63, -87.37, 0.0]\n  Target bbox: [596.27, 333.77, 626.0, 408.79]\n\nFrame 2:\n  Drone pose: [91.38, 51.26, 20.04, -45.68, -88.07, 0.0]\n  Target bbox: [605.8, 370.12, 631.87, 445.88]\n\nFrame 3:\n  Drone pose: [91.21, 50.77, 20.16, -51.69, -84.4, 0.0]\n  Target bbox: [569.0, 273.4, 599.73, 349.6]\n\nFrame 4:\n  Drone pose: [91.26, 50.4, 20.0, -51.02, -84.55, 0.0]\n  Target bbox: [566.92, 276.8, 600.79, 353.64]\n\nFrame 5 (current):\n  Drone pose: [91.26, 49.8, 20.05, -51.97, -93.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 670.25, \"ymin\": 265.65, \"xmax\": 706.68, \"ymax\": 339.86}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.29, \"dz\": -0.05, \"dpitch\": 3.87, \"dyaw\": 4.17, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -0.77, \"dz\": -0.05, \"dpitch\": 3.91, \"dyaw\": 4.2, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.26, \"dz\": -0.05, \"dpitch\": 3.92, \"dyaw\": 4.2, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -1.78, \"dz\": -0.05, \"dpitch\": 3.89, \"dyaw\": 4.34, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.33, \"dz\": -0.05, \"dpitch\": 3.81, \"dyaw\": 4.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.86, "window_alt_abs_m": 0.51, "target_px_mean_hist": 729.5, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00093/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [90.57, 40.92, 19.92, -50.17, -84.33, 0.0]\n  Target bbox: [671.18, 338.78, 715.12, 416.93]\n\nFrame 2:\n  Drone pose: [90.54, 40.49, 19.96, -54.85, -73.43, 0.0]\n  Target bbox: [573.4, 254.34, 620.3, 334.17]\n\nFrame 3:\n  Drone pose: [90.51, 39.66, 19.95, -47.52, -70.24, 0.0]\n  Target bbox: [564.61, 385.47, 608.84, 462.39]\n\nFrame 4:\n  Drone pose: [90.59, 38.62, 19.86, -51.87, -73.13, 0.0]\n  Target bbox: [619.2, 319.61, 660.6, 398.97]\n\nFrame 5 (current):\n  Drone pose: [90.08, 37.59, 20.07, -57.5, -68.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.65, \"ymin\": 236.04, \"xmax\": 656.2, \"ymax\": 314.27}, \"waypoint_deltas\": [{\"dx\": -0.73, \"dy\": -0.61, \"dz\": -0.07, \"dpitch\": 5.83, \"dyaw\": 4.16, \"droll\": 0.0}, {\"dx\": -1.27, \"dy\": -1.31, \"dz\": -0.07, \"dpitch\": 6.39, \"dyaw\": 7.99, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -1.8, \"dz\": -0.07, \"dpitch\": 7.01, \"dyaw\": 10.02, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": -2.25, \"dz\": -0.07, \"dpitch\": 7.57, \"dyaw\": 11.5, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": -2.71, \"dz\": -0.07, \"dpitch\": 8.14, \"dyaw\": 12.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.77, "window_alt_abs_m": 0.34, "target_px_mean_hist": 733.8, "cur_frame_id": 93, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00106/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00110/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.12, 31.61, 19.8, -47.09, -52.01, 0.0]\n  Target bbox: [614.87, 315.8, 647.73, 386.85]\n\nFrame 2:\n  Drone pose: [91.73, 31.07, 19.87, -46.11, -57.11, 0.0]\n  Target bbox: [668.38, 336.49, 707.46, 410.44]\n\nFrame 3:\n  Drone pose: [92.2, 30.74, 19.96, -46.75, -53.19, 0.0]\n  Target bbox: [623.06, 325.1, 656.67, 393.84]\n\nFrame 4:\n  Drone pose: [92.77, 30.3, 20.0, -46.8, -53.5, 0.0]\n  Target bbox: [624.53, 323.47, 655.19, 395.46]\n\nFrame 5 (current):\n  Drone pose: [93.28, 30.0, 20.07, -46.66, -53.91, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.59, \"ymin\": 322.87, \"xmax\": 659.05, \"ymax\": 396.18}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.57, \"dz\": -0.07, \"dpitch\": 0.03, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.99, \"dz\": -0.07, \"dpitch\": 0.2, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 1.28, \"dy\": -1.4, \"dz\": -0.07, \"dpitch\": -0.03, \"dyaw\": -0.91, \"droll\": 0.0}, {\"dx\": 1.61, \"dy\": -1.81, \"dz\": -0.07, \"dpitch\": -0.19, \"dyaw\": -1.93, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": -2.21, \"dz\": -0.07, \"dpitch\": -0.31, \"dyaw\": -2.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.73, "window_alt_abs_m": 0.27, "target_px_mean_hist": 670.2, "cur_frame_id": 110, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00125/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00128/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [97.32, 24.19, 20.0, -47.39, -59.32, 0.0]\n  Target bbox: [559.95, 322.29, 605.7, 401.76]\n\nFrame 2:\n  Drone pose: [97.54, 23.77, 20.0, -43.69, -60.09, 0.0]\n  Target bbox: [558.42, 383.45, 607.01, 466.11]\n\nFrame 3:\n  Drone pose: [97.76, 23.35, 20.0, -47.49, -65.83, 0.0]\n  Target bbox: [616.49, 319.74, 663.82, 399.51]\n\nFrame 4:\n  Drone pose: [97.98, 22.81, 20.01, -47.7, -66.46, 0.0]\n  Target bbox: [622.75, 320.27, 657.56, 398.73]\n\nFrame 5 (current):\n  Drone pose: [98.04, 22.4, 19.89, -51.75, -61.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 558.33, \"ymin\": 248.37, \"xmax\": 606.87, \"ymax\": 329.29}, \"waypoint_deltas\": [{\"dx\": 0.31, \"dy\": -0.34, \"dz\": 0.11, \"dpitch\": 4.2, \"dyaw\": -6.16, \"droll\": 0.0}, {\"dx\": 0.49, \"dy\": -0.77, \"dz\": 0.11, \"dpitch\": 4.19, \"dyaw\": -6.8, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": -1.21, \"dz\": 0.11, \"dpitch\": 4.18, \"dyaw\": -7.41, \"droll\": 0.0}, {\"dx\": 0.84, \"dy\": -1.65, \"dz\": 0.11, \"dpitch\": 4.18, \"dyaw\": -8.01, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -2.09, \"dz\": 0.11, \"dpitch\": 4.17, \"dyaw\": -8.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.86, "window_alt_abs_m": 0.12, "target_px_mean_hist": 698.8, "cur_frame_id": 128, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [100.25, 16.22, 20.0, -47.89, -73.98, 0.0]\n  Target bbox: [618.36, 314.71, 649.27, 391.92]\n\nFrame 2:\n  Drone pose: [100.35, 15.67, 20.06, -47.73, -74.82, 0.0]\n  Target bbox: [618.7, 320.87, 661.69, 398.27]\n\nFrame 3:\n  Drone pose: [100.46, 15.3, 20.0, -51.05, -77.5, 0.0]\n  Target bbox: [643.69, 260.81, 688.54, 339.48]\n\nFrame 4:\n  Drone pose: [100.56, 14.83, 20.0, -47.49, -75.62, 0.0]\n  Target bbox: [618.2, 320.93, 662.21, 398.24]\n\nFrame 5 (current):\n  Drone pose: [100.54, 14.3, 19.94, -47.43, -75.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.93, \"ymin\": 319.96, \"xmax\": 663.48, \"ymax\": 399.13}, \"waypoint_deltas\": [{\"dx\": 0.21, \"dy\": -0.41, \"dz\": 0.06, \"dpitch\": -0.03, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": 0.3, \"dy\": -0.88, \"dz\": 0.06, \"dpitch\": -0.02, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -1.35, \"dz\": 0.06, \"dpitch\": 0.0, \"dyaw\": -1.35, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -1.82, \"dz\": 0.06, \"dpitch\": 0.01, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": -2.29, \"dz\": 0.06, \"dpitch\": 0.03, \"dyaw\": -1.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.49, "window_alt_abs_m": 0.18, "target_px_mean_hist": 711.0, "cur_frame_id": 146, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00160/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00161/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769/aug_001/frames_playback/frame_00164/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.67, 7.7, 20.0, -42.99, -79.15, 0.0]\n  Target bbox: [616.78, 393.62, 657.57, 469.91]\n\nFrame 2:\n  Drone pose: [101.72, 7.22, 20.0, -51.02, -84.52, 0.0]\n  Target bbox: [674.5, 260.4, 719.44, 336.08]\n\nFrame 3:\n  Drone pose: [101.86, 6.63, 19.91, -48.13, -79.9, 0.0]\n  Target bbox: [624.54, 308.15, 654.0, 383.69]\n\nFrame 4:\n  Drone pose: [101.89, 6.11, 19.91, -46.54, -78.39, 0.0]\n  Target bbox: [602.45, 334.56, 638.39, 412.68]\n\nFrame 5 (current):\n  Drone pose: [101.86, 5.77, 20.0, -47.24, -80.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.38, \"ymin\": 321.25, \"xmax\": 655.97, \"ymax\": 397.75}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.29, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -1.46, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -1.95, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": -2.44, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.18, "window_alt_abs_m": 0.18, "target_px_mean_hist": 724.5, "cur_frame_id": 164, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776323769", "difficulty_score": 0.4755, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-130.39, -1.02, 22.0, -54.41, 11.35, 0.0]\n  Target bbox: [622.77, 326.04, 657.15, 392.72]\n\nFrame 2:\n  Drone pose: [-130.14, -1.02, 21.2, -52.71, 12.97, 0.0]\n  Target bbox: [621.7, 321.42, 658.66, 397.42]\n\nFrame 3:\n  Drone pose: [-129.89, -1.01, 20.67, -51.54, 12.74, 0.0]\n  Target bbox: [621.85, 324.8, 658.12, 393.98]\n\nFrame 4:\n  Drone pose: [-129.65, -1.01, 20.64, -50.87, 14.26, 0.0]\n  Target bbox: [619.12, 320.95, 660.67, 397.95]\n\nFrame 5 (current):\n  Drone pose: [-129.4, -1.0, 20.62, -50.18, 15.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.89, \"ymin\": 328.72, \"xmax\": 656.2, \"ymax\": 390.08}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": 0.01, \"dz\": -0.03, \"dpitch\": 0.45, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": 0.01, \"dz\": -0.05, \"dpitch\": 0.89, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 0.02, \"dz\": -0.07, \"dpitch\": 1.32, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": 0.02, \"dz\": -0.09, \"dpitch\": 1.76, \"dyaw\": -0.97, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": 0.03, \"dz\": -0.2, \"dpitch\": 2.31, \"dyaw\": -1.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.82, "window_alt_abs_m": 1.38, "target_px_mean_hist": 553.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-125.69, -0.92, 20.15, -43.76, 12.56, 0.0]\n  Target bbox: [623.95, 324.35, 656.36, 395.0]\n\nFrame 2:\n  Drone pose: [-125.44, -0.91, 20.14, -43.38, 12.39, 0.0]\n  Target bbox: [624.67, 324.22, 655.68, 395.27]\n\nFrame 3:\n  Drone pose: [-124.95, -1.0, 20.12, -43.32, 12.63, 0.0]\n  Target bbox: [623.57, 323.54, 656.78, 395.88]\n\nFrame 4:\n  Drone pose: [-124.46, -1.12, 20.1, -43.26, 12.94, 0.0]\n  Target bbox: [625.83, 328.75, 654.31, 390.43]\n\nFrame 5 (current):\n  Drone pose: [-123.95, -1.25, 20.09, -43.21, 13.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.37, \"ymin\": 329.62, \"xmax\": 653.72, \"ymax\": 389.47}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": -0.15, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": 1.05, \"dy\": -0.32, \"dz\": -0.02, \"dpitch\": 0.08, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.48, \"dz\": -0.03, \"dpitch\": 0.1, \"dyaw\": 1.37, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": -0.64, \"dz\": -0.04, \"dpitch\": 0.1, \"dyaw\": 1.82, \"droll\": 0.0}, {\"dx\": 2.69, \"dy\": -0.76, \"dz\": -0.05, \"dpitch\": 0.09, \"dyaw\": 2.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.1, "window_alt_abs_m": 0.06, "target_px_mean_hist": 530.5, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.06, -1.1, 20.01, -43.15, 24.84, 0.0]\n  Target bbox: [627.71, 326.66, 652.12, 392.57]\n\nFrame 2:\n  Drone pose: [-114.44, -0.91, 20.01, -43.12, 25.77, 0.0]\n  Target bbox: [626.74, 328.24, 653.14, 390.99]\n\nFrame 3:\n  Drone pose: [-113.84, -0.7, 20.0, -43.07, 26.64, 0.0]\n  Target bbox: [624.64, 328.31, 655.23, 390.97]\n\nFrame 4:\n  Drone pose: [-113.24, -0.47, 20.0, -43.01, 27.45, 0.0]\n  Target bbox: [622.84, 324.4, 656.88, 395.0]\n\nFrame 5 (current):\n  Drone pose: [-112.65, -0.23, 20.0, -42.95, 28.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.77, \"ymin\": 325.12, \"xmax\": 658.98, \"ymax\": 394.27}, \"waypoint_deltas\": [{\"dx\": 0.58, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": 1.17, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 1.33, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 1.92, \"droll\": 0.0}, {\"dx\": 2.34, \"dy\": 1.17, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": 2.93, \"dy\": 1.49, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 3.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.35, "window_alt_abs_m": 0.0, "target_px_mean_hist": 530.5, "cur_frame_id": 42, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.02, 4.37, 20.0, -42.53, 35.52, 0.0]\n  Target bbox: [622.07, 326.15, 657.62, 393.24]\n\nFrame 2:\n  Drone pose: [-103.47, 4.73, 20.0, -42.47, 35.91, 0.0]\n  Target bbox: [624.75, 325.2, 654.93, 394.21]\n\nFrame 3:\n  Drone pose: [-102.93, 5.11, 20.0, -42.42, 36.25, 0.0]\n  Target bbox: [623.32, 326.54, 656.39, 392.84]\n\nFrame 4:\n  Drone pose: [-102.4, 5.51, 20.0, -42.37, 36.53, 0.0]\n  Target bbox: [626.84, 328.7, 652.97, 390.58]\n\nFrame 5 (current):\n  Drone pose: [-101.88, 5.92, 20.0, -42.31, 36.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.86, \"ymin\": 326.56, \"xmax\": 656.84, \"ymax\": 392.82}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.42, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 1.3, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": 2.07, \"dy\": 1.74, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": 2.61, \"dy\": 2.19, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.23, "window_alt_abs_m": 0.0, "target_px_mean_hist": 537.2, "cur_frame_id": 61, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.64, 12.25, 20.0, -42.12, 40.51, 0.0]\n  Target bbox: [624.4, 327.9, 655.32, 391.41]\n\nFrame 2:\n  Drone pose: [-93.11, 12.68, 20.0, -42.08, 40.7, 0.0]\n  Target bbox: [625.18, 325.89, 654.49, 393.47]\n\nFrame 3:\n  Drone pose: [-92.6, 13.13, 20.0, -42.05, 40.83, 0.0]\n  Target bbox: [623.43, 327.62, 656.27, 391.69]\n\nFrame 4:\n  Drone pose: [-92.09, 13.6, 20.0, -42.03, 40.9, 0.0]\n  Target bbox: [629.64, 326.93, 650.15, 392.35]\n\nFrame 5 (current):\n  Drone pose: [-91.59, 14.09, 20.0, -42.02, 40.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.65, \"ymin\": 330.15, \"xmax\": 651.2, \"ymax\": 389.07}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": 1.54, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 1.88, \"dy\": 2.08, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 2.32, \"dy\": 2.65, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.42, "window_alt_abs_m": 0.0, "target_px_mean_hist": 535.5, "cur_frame_id": 80, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-85.34, 22.63, 20.0, -41.76, 35.54, 0.0]\n  Target bbox: [625.13, 327.86, 654.85, 391.45]\n\nFrame 2:\n  Drone pose: [-84.99, 23.05, 20.0, -41.91, 34.39, 0.0]\n  Target bbox: [620.24, 323.16, 659.99, 396.46]\n\nFrame 3:\n  Drone pose: [-84.64, 23.43, 20.0, -42.03, 33.31, 0.0]\n  Target bbox: [620.42, 323.22, 659.79, 396.36]\n\nFrame 4:\n  Drone pose: [-84.28, 23.78, 20.0, -42.13, 32.28, 0.0]\n  Target bbox: [624.5, 326.97, 655.56, 392.36]\n\nFrame 5 (current):\n  Drone pose: [-83.92, 24.12, 20.0, -42.21, 31.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.85, \"ymin\": 323.49, \"xmax\": 659.4, \"ymax\": 396.11}, \"waypoint_deltas\": [{\"dx\": 0.37, \"dy\": 0.31, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.94, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": 0.91, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -2.71, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 1.18, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": -3.53, \"droll\": 0.0}, {\"dx\": 1.89, \"dy\": 1.44, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": -4.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.24, "window_alt_abs_m": 0.0, "target_px_mean_hist": 520.5, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00115/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00118/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-77.47, 26.17, 20.0, -42.49, 24.93, 0.0]\n  Target bbox: [621.31, 323.01, 658.98, 396.51]\n\nFrame 2:\n  Drone pose: [-76.93, 26.07, 20.0, -42.48, 25.24, 0.0]\n  Target bbox: [622.55, 324.43, 657.68, 395.06]\n\nFrame 3:\n  Drone pose: [-76.38, 25.95, 20.0, -42.48, 25.6, 0.0]\n  Target bbox: [621.58, 323.53, 658.68, 395.99]\n\nFrame 4:\n  Drone pose: [-75.8, 25.83, 20.0, -42.5, 25.98, 0.0]\n  Target bbox: [625.27, 328.14, 654.78, 391.11]\n\nFrame 5 (current):\n  Drone pose: [-75.21, 25.72, 20.0, -42.54, 26.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.82, \"ymin\": 322.53, \"xmax\": 659.47, \"ymax\": 397.02}, \"waypoint_deltas\": [{\"dx\": 0.61, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.43, \"droll\": 0.0}, {\"dx\": 1.25, \"dy\": -0.26, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.95, \"droll\": 0.0}, {\"dx\": 1.91, \"dy\": -0.43, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 2.78, \"droll\": 0.0}, {\"dx\": 2.59, \"dy\": -0.62, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 5.38, \"droll\": 0.0}, {\"dx\": 3.3, \"dy\": -0.82, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": 8.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 504.8, "cur_frame_id": 118, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00135/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00136/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00137/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-64.91, 25.48, 20.0, -43.41, 52.39, 0.0]\n  Target bbox: [623.56, 326.29, 656.39, 393.05]\n\nFrame 2:\n  Drone pose: [-64.45, 25.71, 20.0, -43.48, 53.88, 0.0]\n  Target bbox: [619.64, 323.33, 660.3, 396.08]\n\nFrame 3:\n  Drone pose: [-64.03, 25.97, 20.0, -43.55, 55.26, 0.0]\n  Target bbox: [618.82, 323.42, 661.17, 396.0]\n\nFrame 4:\n  Drone pose: [-63.65, 26.25, 20.0, -43.6, 56.53, 0.0]\n  Target bbox: [618.91, 322.9, 661.02, 396.5]\n\nFrame 5 (current):\n  Drone pose: [-63.28, 26.54, 20.0, -43.63, 57.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.27, \"ymin\": 325.46, \"xmax\": 656.63, \"ymax\": 393.84}, \"waypoint_deltas\": [{\"dx\": 0.34, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 1.12, \"droll\": 0.0}, {\"dx\": 0.67, \"dy\": 0.66, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 2.17, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 3.22, \"droll\": 0.0}, {\"dx\": 1.32, \"dy\": 1.32, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 4.28, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": 1.65, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": 5.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.33, "window_alt_abs_m": 0.0, "target_px_mean_hist": 537.8, "cur_frame_id": 137, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00152/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00153/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00154/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00155/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00156/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.93, 31.76, 20.0, -43.33, 71.75, 0.0]\n  Target bbox: [624.33, 326.12, 655.49, 393.12]\n\nFrame 2:\n  Drone pose: [-58.7, 32.16, 20.0, -43.3, 72.47, 0.0]\n  Target bbox: [627.3, 327.1, 652.53, 392.15]\n\nFrame 3:\n  Drone pose: [-58.48, 32.57, 20.0, -43.27, 73.14, 0.0]\n  Target bbox: [627.0, 327.75, 652.85, 391.5]\n\nFrame 4:\n  Drone pose: [-58.27, 32.99, 20.0, -43.25, 73.78, 0.0]\n  Target bbox: [627.9, 326.42, 651.9, 392.86]\n\nFrame 5 (current):\n  Drone pose: [-58.07, 33.43, 20.0, -43.24, 74.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.57, \"ymin\": 326.39, \"xmax\": 655.23, \"ymax\": 392.85}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 0.9, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.04, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 1.36, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.47, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 2.31, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 2.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.62, "window_alt_abs_m": 0.0, "target_px_mean_hist": 541.8, "cur_frame_id": 156, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00171/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00172/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00173/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00174/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/ORI/frames_playback/frame_00175/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-57.17, 40.44, 20.0, -42.89, 77.16, 0.0]\n  Target bbox: [629.64, 327.37, 650.19, 391.89]\n\nFrame 2:\n  Drone pose: [-57.2, 40.93, 20.0, -42.87, 77.08, 0.0]\n  Target bbox: [627.98, 328.23, 651.86, 391.02]\n\nFrame 3:\n  Drone pose: [-57.22, 41.44, 20.0, -42.88, 77.03, 0.0]\n  Target bbox: [629.66, 327.29, 650.17, 391.97]\n\nFrame 4:\n  Drone pose: [-57.22, 41.97, 20.0, -42.91, 77.01, 0.0]\n  Target bbox: [627.83, 327.02, 652.0, 392.19]\n\nFrame 5 (current):\n  Drone pose: [-57.19, 42.52, 20.0, -42.99, 77.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.0, \"ymin\": 324.04, \"xmax\": 660.13, \"ymax\": 395.45}, \"waypoint_deltas\": [{\"dx\": 0.05, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.27, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": 1.17, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -1.13, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": 1.78, \"dz\": 0.0, \"dpitch\": -0.3, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": 2.4, \"dz\": 0.0, \"dpitch\": -0.31, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": 2.99, \"dz\": 0.0, \"dpitch\": -0.48, \"dyaw\": -1.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.18, "window_alt_abs_m": 0.0, "target_px_mean_hist": 518.2, "cur_frame_id": 175, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-130.39, -1.02, 22.0, -54.41, 11.35, 0.0]\n  Target bbox: [619.66, 323.37, 660.2, 395.48]\n\nFrame 2:\n  Drone pose: [-130.16, -0.9, 21.26, -53.7, 7.54, 0.0]\n  Target bbox: [675.27, 315.88, 707.12, 376.5]\n\nFrame 3:\n  Drone pose: [-129.89, -1.01, 20.67, -50.66, 7.74, 0.0]\n  Target bbox: [668.22, 336.83, 717.31, 415.36]\n\nFrame 4:\n  Drone pose: [-129.62, -1.12, 20.61, -50.81, 14.7, 0.0]\n  Target bbox: [622.76, 326.21, 657.22, 392.62]\n\nFrame 5 (current):\n  Drone pose: [-129.35, -0.9, 20.48, -50.11, 15.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.05, \"ymin\": 328.31, \"xmax\": 655.06, \"ymax\": 390.48}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.09, \"dz\": 0.11, \"dpitch\": 0.38, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": -0.09, \"dz\": 0.09, \"dpitch\": 0.82, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -0.08, \"dz\": 0.07, \"dpitch\": 1.25, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": 0.94, \"dy\": -0.08, \"dz\": 0.05, \"dpitch\": 1.69, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": 1.19, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 2.24, \"dyaw\": -0.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.7, "window_alt_abs_m": 1.52, "target_px_mean_hist": 569.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-125.69, -0.92, 20.15, -43.76, 12.56, 0.0]\n  Target bbox: [624.32, 323.75, 656.02, 395.68]\n\nFrame 2:\n  Drone pose: [-125.44, -0.91, 20.14, -43.38, 12.39, 0.0]\n  Target bbox: [626.36, 329.0, 653.76, 390.1]\n\nFrame 3:\n  Drone pose: [-124.95, -1.0, 20.12, -40.68, 16.61, 0.0]\n  Target bbox: [575.24, 370.4, 607.03, 440.13]\n\nFrame 4:\n  Drone pose: [-124.54, -1.21, 20.06, -43.04, 13.14, 0.0]\n  Target bbox: [625.87, 329.93, 654.23, 389.21]\n\nFrame 5 (current):\n  Drone pose: [-123.93, -1.3, 20.24, -43.43, 13.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.96, \"ymin\": 324.79, \"xmax\": 655.34, \"ymax\": 394.61}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.1, \"dz\": -0.16, \"dpitch\": 0.27, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -0.27, \"dz\": -0.17, \"dpitch\": 0.3, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": 1.57, \"dy\": -0.43, \"dz\": -0.18, \"dpitch\": 0.32, \"dyaw\": 1.23, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": -0.59, \"dz\": -0.19, \"dpitch\": 0.32, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": 2.67, \"dy\": -0.71, \"dz\": -0.2, \"dpitch\": 0.31, \"dyaw\": 2.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.19, "window_alt_abs_m": 0.27, "target_px_mean_hist": 543.5, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-115.06, -1.1, 20.01, -44.73, 29.84, 0.0]\n  Target bbox: [564.11, 302.98, 592.7, 367.13]\n\nFrame 2:\n  Drone pose: [-114.35, -0.89, 19.96, -43.18, 25.85, 0.0]\n  Target bbox: [621.19, 325.13, 658.59, 394.27]\n\nFrame 3:\n  Drone pose: [-113.75, -0.74, 20.16, -43.38, 26.84, 0.0]\n  Target bbox: [625.67, 325.65, 654.12, 393.62]\n\nFrame 4:\n  Drone pose: [-113.08, -0.41, 20.05, -43.98, 28.96, 0.0]\n  Target bbox: [607.22, 317.79, 636.19, 379.51]\n\nFrame 5 (current):\n  Drone pose: [-112.74, -0.17, 20.0, -44.42, 23.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 676.17, \"ymin\": 300.67, \"xmax\": 714.03, \"ymax\": 370.08}, \"waypoint_deltas\": [{\"dx\": 0.67, \"dy\": 0.2, \"dz\": 0.0, \"dpitch\": 1.52, \"dyaw\": 5.4, \"droll\": 0.0}, {\"dx\": 1.26, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 1.56, \"dyaw\": 6.04, \"droll\": 0.0}, {\"dx\": 1.85, \"dy\": 0.8, \"dz\": 0.0, \"dpitch\": 1.59, \"dyaw\": 6.63, \"droll\": 0.0}, {\"dx\": 2.43, \"dy\": 1.11, \"dz\": 0.0, \"dpitch\": 1.62, \"dyaw\": 7.2, \"droll\": 0.0}, {\"dx\": 3.02, \"dy\": 1.43, \"dz\": 0.0, \"dpitch\": 1.64, \"dyaw\": 7.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.57, "window_alt_abs_m": 0.4, "target_px_mean_hist": 543.8, "cur_frame_id": 42, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.06, 4.44, 19.97, -44.08, 39.13, 0.0]\n  Target bbox: [574.75, 301.64, 609.29, 366.94]\n\nFrame 2:\n  Drone pose: [-103.47, 4.73, 20.0, -42.47, 35.91, 0.0]\n  Target bbox: [623.88, 325.11, 655.81, 394.25]\n\nFrame 3:\n  Drone pose: [-102.93, 5.11, 20.0, -39.22, 38.2, 0.0]\n  Target bbox: [599.42, 382.69, 631.47, 444.57]\n\nFrame 4:\n  Drone pose: [-102.54, 5.62, 19.95, -40.51, 41.05, 0.0]\n  Target bbox: [564.52, 356.97, 590.41, 423.55]\n\nFrame 5 (current):\n  Drone pose: [-101.88, 5.92, 20.0, -42.31, 36.75, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.93, \"ymin\": 328.07, \"xmax\": 654.86, \"ymax\": 391.19}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.42, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.86, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 1.3, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.5, \"droll\": 0.0}, {\"dx\": 2.07, \"dy\": 1.74, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.67, \"droll\": 0.0}, {\"dx\": 2.61, \"dy\": 2.19, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.67, "window_alt_abs_m": 0.13, "target_px_mean_hist": 533.8, "cur_frame_id": 61, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-93.67, 12.16, 19.87, -42.94, 37.08, 0.0]\n  Target bbox: [671.41, 309.42, 697.7, 373.73]\n\nFrame 2:\n  Drone pose: [-93.18, 12.73, 20.07, -42.16, 40.47, 0.0]\n  Target bbox: [627.83, 326.88, 651.93, 392.41]\n\nFrame 3:\n  Drone pose: [-92.6, 13.13, 20.0, -42.5, 37.29, 0.0]\n  Target bbox: [667.17, 320.18, 701.25, 386.11]\n\nFrame 4:\n  Drone pose: [-92.09, 13.6, 20.0, -39.68, 40.57, 0.0]\n  Target bbox: [631.17, 370.01, 657.03, 428.07]\n\nFrame 5 (current):\n  Drone pose: [-91.64, 14.07, 20.12, -41.85, 44.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 580.35, \"ymin\": 332.17, \"xmax\": 609.85, \"ymax\": 398.34}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.52, \"dz\": -0.12, \"dpitch\": -0.15, \"dyaw\": -3.55, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 1.03, \"dz\": -0.12, \"dpitch\": -0.14, \"dyaw\": -3.61, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": 1.56, \"dz\": -0.12, \"dpitch\": -0.13, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": 2.1, \"dz\": -0.12, \"dpitch\": -0.12, \"dyaw\": -3.91, \"droll\": 0.0}, {\"dx\": 2.37, \"dy\": 2.67, \"dz\": -0.12, \"dpitch\": -0.12, \"dyaw\": -4.14, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.71, "window_alt_abs_m": 0.39, "target_px_mean_hist": 537.8, "cur_frame_id": 80, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-85.27, 22.52, 19.87, -40.14, 40.91, 0.0]\n  Target bbox: [560.17, 351.73, 593.56, 418.22]\n\nFrame 2:\n  Drone pose: [-84.99, 23.05, 20.0, -42.0, 34.58, 0.0]\n  Target bbox: [623.57, 326.8, 651.62, 389.71]\n\nFrame 3:\n  Drone pose: [-84.46, 23.4, 20.03, -42.25, 33.64, 0.0]\n  Target bbox: [625.0, 327.86, 654.97, 391.41]\n\nFrame 4:\n  Drone pose: [-84.46, 23.74, 20.03, -41.95, 32.12, 0.0]\n  Target bbox: [622.13, 324.19, 658.05, 395.3]\n\nFrame 5 (current):\n  Drone pose: [-84.09, 24.11, 20.1, -42.15, 31.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.85, \"ymin\": 323.46, \"xmax\": 659.38, \"ymax\": 396.12}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": 0.32, \"dz\": -0.1, \"dpitch\": -0.12, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": 0.63, \"dz\": -0.1, \"dpitch\": -0.18, \"dyaw\": -1.63, \"droll\": 0.0}, {\"dx\": 1.29, \"dy\": 0.92, \"dz\": -0.1, \"dpitch\": -0.23, \"dyaw\": -2.49, \"droll\": 0.0}, {\"dx\": 1.67, \"dy\": 1.19, \"dz\": -0.1, \"dpitch\": -0.26, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": 2.06, \"dy\": 1.45, \"dz\": -0.1, \"dpitch\": -0.29, \"dyaw\": -4.08, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.84, "window_alt_abs_m": 0.23, "target_px_mean_hist": 521.2, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00115/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00116/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00117/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00118/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-77.47, 26.17, 20.0, -42.49, 24.93, 0.0]\n  Target bbox: [621.81, 324.08, 658.42, 395.39]\n\nFrame 2:\n  Drone pose: [-76.93, 25.99, 20.01, -41.79, 21.68, 0.0]\n  Target bbox: [671.07, 340.33, 702.6, 403.71]\n\nFrame 3:\n  Drone pose: [-76.38, 25.95, 20.0, -44.53, 26.49, 0.0]\n  Target bbox: [612.25, 290.88, 645.39, 359.88]\n\nFrame 4:\n  Drone pose: [-75.91, 25.73, 20.11, -41.36, 21.64, 0.0]\n  Target bbox: [677.93, 345.17, 713.48, 414.75]\n\nFrame 5 (current):\n  Drone pose: [-75.17, 25.9, 19.98, -41.32, 25.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 634.08, \"ymin\": 351.34, \"xmax\": 663.66, \"ymax\": 413.58}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": -0.3, \"dz\": 0.02, \"dpitch\": -1.29, \"dyaw\": 1.52, \"droll\": 0.0}, {\"dx\": 1.21, \"dy\": -0.44, \"dz\": 0.02, \"dpitch\": -1.37, \"dyaw\": 2.04, \"droll\": 0.0}, {\"dx\": 1.87, \"dy\": -0.61, \"dz\": 0.02, \"dpitch\": -1.13, \"dyaw\": 3.87, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -0.8, \"dz\": 0.02, \"dpitch\": -1.46, \"dyaw\": 6.47, \"droll\": 0.0}, {\"dx\": 3.26, \"dy\": -1.0, \"dz\": 0.02, \"dpitch\": -1.76, \"dyaw\": 9.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.55, "window_alt_abs_m": 0.26, "target_px_mean_hist": 527.8, "cur_frame_id": 118, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00133/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00134/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00135/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00136/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00137/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-64.82, 25.45, 19.97, -43.4, 52.64, 0.0]\n  Target bbox: [622.33, 324.51, 657.57, 394.82]\n\nFrame 2:\n  Drone pose: [-64.5, 25.71, 20.14, -43.65, 53.78, 0.0]\n  Target bbox: [618.69, 323.82, 661.32, 395.65]\n\nFrame 3:\n  Drone pose: [-64.11, 26.04, 19.86, -42.62, 55.72, 0.0]\n  Target bbox: [613.83, 337.04, 647.51, 406.99]\n\nFrame 4:\n  Drone pose: [-63.61, 26.13, 20.09, -43.88, 51.79, 0.0]\n  Target bbox: [681.04, 322.17, 721.25, 392.12]\n\nFrame 5 (current):\n  Drone pose: [-63.35, 26.59, 19.83, -43.37, 57.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.89, \"ymin\": 323.66, \"xmax\": 661.06, \"ymax\": 395.79}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": 0.27, \"dz\": 0.17, \"dpitch\": -0.3, \"dyaw\": 1.35, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 0.61, \"dz\": 0.17, \"dpitch\": -0.35, \"dyaw\": 2.4, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": 0.95, \"dz\": 0.17, \"dpitch\": -0.38, \"dyaw\": 3.45, \"droll\": 0.0}, {\"dx\": 1.39, \"dy\": 1.27, \"dz\": 0.17, \"dpitch\": -0.36, \"dyaw\": 4.51, \"droll\": 0.0}, {\"dx\": 1.7, \"dy\": 1.6, \"dz\": 0.17, \"dpitch\": -0.36, \"dyaw\": 5.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.7, "window_alt_abs_m": 0.95, "target_px_mean_hist": 535.8, "cur_frame_id": 137, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00152/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00153/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00154/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00155/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00156/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-58.87, 31.68, 20.09, -39.38, 70.37, 0.0]\n  Target bbox: [641.28, 393.73, 678.79, 461.23]\n\nFrame 2:\n  Drone pose: [-58.85, 32.14, 20.11, -40.66, 67.09, 0.0]\n  Target bbox: [684.46, 373.72, 718.13, 440.47]\n\nFrame 3:\n  Drone pose: [-58.57, 32.72, 20.04, -43.5, 72.76, 0.0]\n  Target bbox: [627.6, 327.1, 652.23, 392.13]\n\nFrame 4:\n  Drone pose: [-58.17, 32.93, 19.87, -38.88, 72.74, 0.0]\n  Target bbox: [645.43, 396.26, 667.73, 462.33]\n\nFrame 5 (current):\n  Drone pose: [-58.07, 33.43, 20.0, -42.45, 70.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 672.44, \"ymin\": 340.89, \"xmax\": 700.64, \"ymax\": 406.94}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": -0.78, \"dyaw\": 4.34, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 0.9, \"dz\": 0.0, \"dpitch\": -0.79, \"dyaw\": 4.83, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 1.36, \"dz\": 0.0, \"dpitch\": -0.79, \"dyaw\": 5.26, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": -0.8, \"dyaw\": 5.64, \"droll\": 0.0}, {\"dx\": 0.74, \"dy\": 2.31, \"dz\": 0.0, \"dpitch\": -0.8, \"dyaw\": 5.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.13, "window_alt_abs_m": 0.39, "target_px_mean_hist": 560.0, "cur_frame_id": 156, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00171/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00172/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00173/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00174/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982/aug_001/frames_playback/frame_00175/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-57.14, 40.55, 20.04, -44.82, 81.76, 0.0]\n  Target bbox: [572.39, 301.27, 594.81, 363.45]\n\nFrame 2:\n  Drone pose: [-57.2, 40.93, 20.0, -42.87, 77.08, 0.0]\n  Target bbox: [621.12, 325.75, 658.61, 393.59]\n\nFrame 3:\n  Drone pose: [-57.21, 41.53, 20.16, -38.46, 77.35, 0.0]\n  Target bbox: [619.81, 407.43, 651.13, 472.87]\n\nFrame 4:\n  Drone pose: [-57.22, 41.97, 20.0, -42.91, 77.01, 0.0]\n  Target bbox: [621.19, 326.59, 658.58, 392.74]\n\nFrame 5 (current):\n  Drone pose: [-57.25, 42.42, 20.07, -39.5, 81.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 564.8, \"ymin\": 382.98, \"xmax\": 606.51, \"ymax\": 454.99}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": 0.67, \"dz\": -0.07, \"dpitch\": -3.44, \"dyaw\": -5.56, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 1.27, \"dz\": -0.07, \"dpitch\": -3.6, \"dyaw\": -5.42, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": 1.88, \"dz\": -0.07, \"dpitch\": -3.79, \"dyaw\": -5.22, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": 2.5, \"dz\": -0.07, \"dpitch\": -3.8, \"dyaw\": -6.37, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 3.09, \"dz\": -0.07, \"dpitch\": -3.97, \"dyaw\": -6.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.6, "window_alt_abs_m": 0.44, "target_px_mean_hist": 534.0, "cur_frame_id": 175, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-17/trajectory_1776366982", "difficulty_score": 0.4433, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [78.89, -56.56, 22.0, -46.45, 180.0, 0.0]\n  Target bbox: [624.2, 331.2, 655.8, 388.01]\n\nFrame 2:\n  Drone pose: [76.85, -58.04, 21.2, -47.54, 175.4, 0.0]\n  Target bbox: [621.23, 326.45, 658.42, 392.66]\n\nFrame 3:\n  Drone pose: [75.51, -58.93, 20.67, -47.95, 172.31, 0.0]\n  Target bbox: [620.81, 322.37, 658.74, 396.85]\n\nFrame 4:\n  Drone pose: [74.57, -59.44, 20.64, -48.49, 170.48, 0.0]\n  Target bbox: [619.42, 325.66, 660.29, 393.34]\n\nFrame 5 (current):\n  Drone pose: [73.87, -59.7, 20.62, -48.7, 169.52, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.37, \"ymin\": 327.25, \"xmax\": 659.4, \"ymax\": 391.75}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.11, \"dz\": -0.03, \"dpitch\": -0.07, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -0.17, \"dz\": -0.05, \"dpitch\": -0.06, \"dyaw\": -0.62, \"droll\": 0.0}, {\"dx\": -1.61, \"dy\": -0.2, \"dz\": -0.07, \"dpitch\": -0.01, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -0.25, \"dz\": -0.09, \"dpitch\": 0.08, \"dyaw\": -0.87, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -0.3, \"dz\": -0.2, \"dpitch\": 0.33, \"dyaw\": -1.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.48, "window_alt_abs_m": 1.38, "target_px_mean_hist": 656.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [71.34, -60.0, 20.42, -48.37, 168.52, 0.0]\n  Target bbox: [619.82, 324.99, 659.88, 394.08]\n\nFrame 2:\n  Drone pose: [70.89, -60.06, 20.39, -48.22, 168.36, 0.0]\n  Target bbox: [620.12, 326.72, 659.66, 392.29]\n\nFrame 3:\n  Drone pose: [70.45, -60.12, 20.36, -48.07, 168.19, 0.0]\n  Target bbox: [618.93, 320.06, 660.61, 399.07]\n\nFrame 4:\n  Drone pose: [70.0, -60.19, 20.33, -47.91, 168.01, 0.0]\n  Target bbox: [619.82, 325.5, 659.93, 393.51]\n\nFrame 5 (current):\n  Drone pose: [69.56, -60.26, 20.3, -47.76, 167.83, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.98, \"ymin\": 326.19, \"xmax\": 659.78, \"ymax\": 392.85}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": -0.07, \"dz\": -0.03, \"dpitch\": 0.16, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -0.13, \"dz\": -0.06, \"dpitch\": 0.28, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -0.22, \"dz\": -0.08, \"dpitch\": 0.38, \"dyaw\": -0.61, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": -0.24, \"dz\": -0.11, \"dpitch\": 0.42, \"dyaw\": -0.68, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": -0.24, \"dz\": -0.13, \"dpitch\": 0.46, \"dyaw\": -0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.68, "window_alt_abs_m": 0.12, "target_px_mean_hist": 696.0, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [66.67, -60.47, 20.15, -47.28, 167.23, 0.0]\n  Target bbox: [618.49, 320.36, 661.02, 398.94]\n\nFrame 2:\n  Drone pose: [66.25, -60.23, 20.14, -47.21, 168.04, 0.0]\n  Target bbox: [620.11, 326.41, 659.67, 392.64]\n\nFrame 3:\n  Drone pose: [65.86, -59.91, 20.12, -47.12, 169.12, 0.0]\n  Target bbox: [619.02, 319.43, 660.46, 399.87]\n\nFrame 4:\n  Drone pose: [65.47, -59.58, 20.1, -47.02, 170.23, 0.0]\n  Target bbox: [618.53, 322.24, 661.05, 396.88]\n\nFrame 5 (current):\n  Drone pose: [65.05, -59.33, 20.09, -46.95, 171.08, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.29, \"ymin\": 319.54, \"xmax\": 660.16, \"ymax\": 399.76}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.12, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.39, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.13, \"dz\": -0.02, \"dpitch\": -0.02, \"dyaw\": 0.39, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": -0.03, \"dpitch\": -0.07, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": -0.03, \"dz\": -0.04, \"dpitch\": -0.13, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": -0.14, \"dz\": -0.05, \"dpitch\": -0.18, \"dyaw\": -0.56, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.86, "window_alt_abs_m": 0.06, "target_px_mean_hist": 710.5, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [62.37, -59.47, 20.04, -47.13, 170.52, 0.0]\n  Target bbox: [620.03, 326.72, 659.74, 392.25]\n\nFrame 2:\n  Drone pose: [61.83, -59.56, 20.04, -47.15, 170.23, 0.0]\n  Target bbox: [619.85, 325.02, 659.84, 394.05]\n\nFrame 3:\n  Drone pose: [61.33, -59.59, 20.03, -47.14, 170.12, 0.0]\n  Target bbox: [620.09, 324.59, 659.58, 394.52]\n\nFrame 4:\n  Drone pose: [60.84, -59.61, 20.03, -47.11, 170.07, 0.0]\n  Target bbox: [620.2, 325.73, 659.52, 393.35]\n\nFrame 5 (current):\n  Drone pose: [60.33, -59.65, 20.02, -47.11, 169.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.33, \"ymin\": 326.78, \"xmax\": 659.44, \"ymax\": 392.26}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": -0.16, \"dz\": -0.01, \"dpitch\": -0.04, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.24, \"dz\": -0.01, \"dpitch\": -0.06, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.37, \"dz\": -0.01, \"dpitch\": -0.12, \"dyaw\": -1.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.57, "window_alt_abs_m": 0.02, "target_px_mean_hist": 714.2, "cur_frame_id": 32, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [57.06, -60.18, 20.01, -47.35, 168.07, 0.0]\n  Target bbox: [618.08, 318.46, 661.38, 400.78]\n\nFrame 2:\n  Drone pose: [56.42, -60.36, 20.01, -47.5, 167.4, 0.0]\n  Target bbox: [620.21, 322.49, 659.94, 396.63]\n\nFrame 3:\n  Drone pose: [55.78, -60.54, 20.0, -47.84, 168.35, 0.0]\n  Target bbox: [619.27, 325.97, 660.51, 392.97]\n\nFrame 4:\n  Drone pose: [55.15, -60.69, 20.0, -47.99, 167.77, 0.0]\n  Target bbox: [619.34, 322.36, 660.79, 396.77]\n\nFrame 5 (current):\n  Drone pose: [54.56, -60.81, 20.0, -48.27, 168.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.54, \"ymin\": 319.99, \"xmax\": 661.0, \"ymax\": 399.07}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": -1.09, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": -1.59, \"dy\": -0.19, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 0.95, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": -0.27, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 2.39, \"droll\": 0.0}, {\"dx\": -2.46, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 2.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.4, "window_alt_abs_m": 0.0, "target_px_mean_hist": 721.8, "cur_frame_id": 42, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [51.69, -61.29, 20.0, -48.33, 172.41, 0.0]\n  Target bbox: [618.97, 322.99, 660.63, 395.98]\n\nFrame 2:\n  Drone pose: [51.28, -61.43, 20.0, -48.15, 172.01, 0.0]\n  Target bbox: [620.17, 325.65, 659.52, 393.35]\n\nFrame 3:\n  Drone pose: [50.86, -61.58, 20.0, -47.98, 171.54, 0.0]\n  Target bbox: [615.23, 318.1, 664.98, 401.13]\n\nFrame 4:\n  Drone pose: [50.43, -61.75, 20.0, -47.94, 172.65, 0.0]\n  Target bbox: [618.93, 320.3, 660.57, 398.74]\n\nFrame 5 (current):\n  Drone pose: [49.98, -61.94, 20.0, -47.81, 172.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.97, \"ymin\": 320.4, \"xmax\": 662.24, \"ymax\": 398.8}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 1.34, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -0.8, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 1.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.59, "window_alt_abs_m": 0.0, "target_px_mean_hist": 716.8, "cur_frame_id": 52, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [47.59, -62.91, 20.0, -47.75, 173.86, 0.0]\n  Target bbox: [620.68, 326.29, 659.01, 392.72]\n\nFrame 2:\n  Drone pose: [47.1, -63.06, 20.0, -47.7, 173.36, 0.0]\n  Target bbox: [615.67, 318.67, 664.51, 400.58]\n\nFrame 3:\n  Drone pose: [46.62, -63.2, 20.0, -47.73, 174.56, 0.0]\n  Target bbox: [619.81, 324.88, 659.83, 394.08]\n\nFrame 4:\n  Drone pose: [46.13, -63.33, 20.0, -47.69, 174.13, 0.0]\n  Target bbox: [620.75, 321.53, 658.78, 397.55]\n\nFrame 5 (current):\n  Drone pose: [45.64, -63.45, 20.0, -47.65, 173.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.06, \"ymin\": 316.61, \"xmax\": 667.93, \"ymax\": 402.68}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 1.24, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.38, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.04, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.38, \"dyaw\": 2.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 721.5, "cur_frame_id": 61, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [42.41, -64.21, 20.0, -48.15, 176.12, 0.0]\n  Target bbox: [615.98, 319.03, 664.16, 400.16]\n\nFrame 2:\n  Drone pose: [41.82, -64.31, 20.0, -48.33, 177.44, 0.0]\n  Target bbox: [621.16, 321.11, 658.35, 397.87]\n\nFrame 3:\n  Drone pose: [41.24, -64.4, 20.0, -48.46, 177.14, 0.0]\n  Target bbox: [616.49, 319.4, 663.67, 399.74]\n\nFrame 4:\n  Drone pose: [40.66, -64.47, 20.0, -48.62, 178.6, 0.0]\n  Target bbox: [620.66, 328.06, 659.04, 390.77]\n\nFrame 5 (current):\n  Drone pose: [40.07, -64.53, 20.0, -48.76, 178.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.5, \"ymin\": 327.63, \"xmax\": 659.19, \"ymax\": 391.2}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": 1.32, \"droll\": 0.0}, {\"dx\": -1.74, \"dy\": -0.17, \"dz\": 0.0, \"dpitch\": -0.42, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": -2.3, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": 2.58, \"droll\": 0.0}, {\"dx\": -2.87, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": -0.65, \"dyaw\": 2.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.29, "window_alt_abs_m": 0.0, "target_px_mean_hist": 732.5, "cur_frame_id": 71, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [37.2, -64.86, 20.0, -49.41, -179.31, 0.0]\n  Target bbox: [618.7, 321.2, 661.32, 397.78]\n\nFrame 2:\n  Drone pose: [36.64, -64.94, 20.0, -49.49, -177.83, 0.0]\n  Target bbox: [621.47, 320.65, 659.03, 398.24]\n\nFrame 3:\n  Drone pose: [36.09, -65.02, 20.0, -49.59, -178.12, 0.0]\n  Target bbox: [617.08, 320.54, 662.9, 398.53]\n\nFrame 4:\n  Drone pose: [35.54, -65.1, 20.0, -49.64, -176.61, 0.0]\n  Target bbox: [620.93, 327.14, 659.36, 391.7]\n\nFrame 5 (current):\n  Drone pose: [35.01, -65.16, 20.0, -49.71, -176.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.88, \"ymin\": 321.56, \"xmax\": 661.14, \"ymax\": 397.39}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.64, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 1.53, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 3.24, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 3.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.46, "window_alt_abs_m": 0.0, "target_px_mean_hist": 755.2, "cur_frame_id": 80, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/ORI/frames_playback/frame_00090/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [32.06, -65.15, 20.0, -49.36, -171.53, 0.0]\n  Target bbox: [619.13, 318.28, 661.44, 400.83]\n\nFrame 2:\n  Drone pose: [31.66, -65.08, 20.0, -49.16, -171.34, 0.0]\n  Target bbox: [616.13, 320.28, 663.87, 398.79]\n\nFrame 3:\n  Drone pose: [31.27, -65.0, 20.0, -48.82, -169.44, 0.0]\n  Target bbox: [619.21, 319.25, 661.27, 399.75]\n\nFrame 4:\n  Drone pose: [30.87, -64.93, 20.0, -48.62, -169.26, 0.0]\n  Target bbox: [613.51, 317.65, 666.4, 401.48]\n\nFrame 5 (current):\n  Drone pose: [30.47, -64.86, 20.0, -48.26, -167.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.16, \"ymin\": 318.2, \"xmax\": 662.39, \"ymax\": 401.04}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.15, \"dz\": 0.0, \"dpitch\": 0.59, \"dyaw\": 1.93, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": 0.79, \"dyaw\": 2.07, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": 0.29, \"dz\": 0.0, \"dpitch\": 0.98, \"dyaw\": 2.22, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": 0.36, \"dz\": 0.0, \"dpitch\": 1.39, \"dyaw\": 3.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.1, "window_alt_abs_m": 0.0, "target_px_mean_hist": 732.8, "cur_frame_id": 90, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [78.89, -56.56, 22.0, -46.81, 176.56, 0.0]\n  Target bbox: [660.57, 319.67, 699.52, 389.27]\n\nFrame 2:\n  Drone pose: [76.91, -58.2, 21.24, -50.4, 170.71, 0.0]\n  Target bbox: [617.82, 321.37, 662.0, 397.71]\n\nFrame 3:\n  Drone pose: [75.51, -58.93, 20.67, -46.9, 177.31, 0.0]\n  Target bbox: [563.52, 343.84, 602.76, 414.31]\n\nFrame 4:\n  Drone pose: [74.57, -59.44, 20.64, -46.04, 165.48, 0.0]\n  Target bbox: [673.86, 366.54, 718.15, 438.3]\n\nFrame 5 (current):\n  Drone pose: [73.87, -59.7, 20.62, -46.36, 164.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 669.41, \"ymin\": 362.9, \"xmax\": 713.0, \"ymax\": 437.79}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.11, \"dz\": -0.03, \"dpitch\": -2.41, \"dyaw\": 4.16, \"droll\": 0.0}, {\"dx\": -1.11, \"dy\": -0.17, \"dz\": -0.05, \"dpitch\": -2.4, \"dyaw\": 3.97, \"droll\": 0.0}, {\"dx\": -1.61, \"dy\": -0.2, \"dz\": -0.07, \"dpitch\": -2.35, \"dyaw\": 3.84, \"droll\": 0.0}, {\"dx\": -2.08, \"dy\": -0.25, \"dz\": -0.09, \"dpitch\": -2.26, \"dyaw\": 3.72, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -0.3, \"dz\": -0.2, \"dpitch\": -2.01, \"dyaw\": 3.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.85, "window_alt_abs_m": 1.38, "target_px_mean_hist": 657.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [71.34, -60.0, 20.42, -48.87, 173.52, 0.0]\n  Target bbox: [562.07, 318.42, 605.09, 387.6]\n\nFrame 2:\n  Drone pose: [70.89, -60.06, 20.39, -43.22, 169.07, 0.0]\n  Target bbox: [611.6, 406.67, 651.65, 480.75]\n\nFrame 3:\n  Drone pose: [70.45, -60.12, 20.36, -48.07, 168.19, 0.0]\n  Target bbox: [618.85, 319.63, 660.64, 399.62]\n\nFrame 4:\n  Drone pose: [70.0, -60.19, 20.33, -45.6, 170.13, 0.0]\n  Target bbox: [595.34, 362.01, 635.63, 435.05]\n\nFrame 5 (current):\n  Drone pose: [69.56, -60.26, 20.3, -52.09, 169.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 594.95, \"ymin\": 251.05, \"xmax\": 635.74, \"ymax\": 323.03}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": -0.07, \"dz\": -0.03, \"dpitch\": 4.49, \"dyaw\": -2.3, \"droll\": 0.0}, {\"dx\": -0.91, \"dy\": -0.13, \"dz\": -0.06, \"dpitch\": 4.61, \"dyaw\": -2.49, \"droll\": 0.0}, {\"dx\": -1.38, \"dy\": -0.22, \"dz\": -0.08, \"dpitch\": 4.71, \"dyaw\": -2.73, \"droll\": 0.0}, {\"dx\": -1.89, \"dy\": -0.24, \"dz\": -0.11, \"dpitch\": 4.75, \"dyaw\": -2.8, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": -0.24, \"dz\": -0.13, \"dpitch\": 4.79, \"dyaw\": -2.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.44, "window_alt_abs_m": 0.12, "target_px_mean_hist": 717.0, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [66.67, -60.47, 20.15, -44.65, 169.71, 0.0]\n  Target bbox: [589.02, 363.56, 632.86, 445.06]\n\nFrame 2:\n  Drone pose: [66.25, -60.23, 20.14, -47.21, 168.04, 0.0]\n  Target bbox: [619.34, 320.92, 660.22, 398.27]\n\nFrame 3:\n  Drone pose: [65.86, -59.91, 20.12, -47.12, 169.12, 0.0]\n  Target bbox: [619.02, 324.08, 660.66, 394.99]\n\nFrame 4:\n  Drone pose: [65.47, -59.58, 20.1, -47.02, 170.23, 0.0]\n  Target bbox: [619.46, 321.95, 660.11, 397.19]\n\nFrame 5 (current):\n  Drone pose: [65.05, -59.33, 20.09, -51.95, 172.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 604.95, \"ymin\": 242.94, \"xmax\": 644.77, \"ymax\": 308.11}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.12, \"dz\": -0.01, \"dpitch\": 5.01, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -1.02, \"dy\": 0.13, \"dz\": -0.02, \"dpitch\": 4.98, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": 0.07, \"dz\": -0.03, \"dpitch\": 4.93, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": -2.13, \"dy\": -0.03, \"dz\": -0.04, \"dpitch\": 4.87, \"dyaw\": -1.46, \"droll\": 0.0}, {\"dx\": -2.68, \"dy\": -0.14, \"dz\": -0.05, \"dpitch\": 4.82, \"dyaw\": -1.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.98, "window_alt_abs_m": 0.06, "target_px_mean_hist": 711.5, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [62.37, -59.47, 20.04, -48.41, 172.19, 0.0]\n  Target bbox: [600.23, 299.16, 640.12, 377.32]\n\nFrame 2:\n  Drone pose: [61.83, -59.56, 20.04, -50.15, 165.23, 0.0]\n  Target bbox: [676.57, 278.61, 718.97, 343.23]\n\nFrame 3:\n  Drone pose: [61.19, -59.55, 19.91, -51.74, 161.01, 0.0]\n  Target bbox: [668.97, 261.87, 724.39, 345.02]\n\nFrame 4:\n  Drone pose: [60.84, -59.61, 20.03, -51.75, 165.07, 0.0]\n  Target bbox: [675.94, 247.03, 719.44, 319.84]\n\nFrame 5 (current):\n  Drone pose: [60.33, -59.65, 20.02, -47.2, 172.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 591.9, \"ymin\": 320.34, \"xmax\": 631.68, \"ymax\": 396.83}, \"waypoint_deltas\": [{\"dx\": -0.53, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -2.6, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -2.77, \"droll\": 0.0}, {\"dx\": -1.57, \"dy\": -0.16, \"dz\": -0.01, \"dpitch\": 0.05, \"dyaw\": -2.95, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.24, \"dz\": -0.01, \"dpitch\": 0.03, \"dyaw\": -3.24, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.37, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -3.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 22.51, "window_alt_abs_m": 0.26, "target_px_mean_hist": 741.5, "cur_frame_id": 32, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [57.06, -60.18, 20.01, -42.35, 163.44, 0.0]\n  Target bbox: [671.02, 405.09, 715.18, 485.53]\n\nFrame 2:\n  Drone pose: [56.42, -60.36, 20.01, -42.5, 167.55, 0.0]\n  Target bbox: [617.13, 406.39, 659.59, 480.93]\n\nFrame 3:\n  Drone pose: [55.78, -60.54, 20.0, -46.9, 167.45, 0.0]\n  Target bbox: [629.18, 336.04, 671.05, 414.81]\n\nFrame 4:\n  Drone pose: [55.15, -60.69, 20.0, -52.54, 163.66, 0.0]\n  Target bbox: [665.11, 249.74, 708.33, 318.89]\n\nFrame 5 (current):\n  Drone pose: [54.64, -60.85, 19.94, -52.99, 164.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 614.64, \"ymin\": 319.63, \"xmax\": 665.18, \"ymax\": 399.09}, \"waypoint_deltas\": [{\"dx\": -0.64, \"dy\": -0.03, \"dz\": 0.06, \"dpitch\": 4.64, \"dyaw\": 4.39, \"droll\": 0.0}, {\"dx\": -1.17, \"dy\": -0.09, \"dz\": 0.06, \"dpitch\": 4.45, \"dyaw\": 5.84, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": -0.15, \"dz\": 0.06, \"dpitch\": 4.48, \"dyaw\": 5.64, \"droll\": 0.0}, {\"dx\": -2.12, \"dy\": -0.23, \"dz\": 0.06, \"dpitch\": 4.45, \"dyaw\": 7.08, \"droll\": 0.0}, {\"dx\": -2.54, \"dy\": -0.33, \"dz\": 0.06, \"dpitch\": 4.6, \"dyaw\": 6.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.61, "window_alt_abs_m": 0.07, "target_px_mean_hist": 725.0, "cur_frame_id": 42, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00052/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [51.69, -61.29, 20.0, -47.87, 167.41, 0.0]\n  Target bbox: [674.21, 329.86, 717.91, 408.47]\n\nFrame 2:\n  Drone pose: [51.28, -61.43, 20.0, -48.15, 172.01, 0.0]\n  Target bbox: [619.74, 319.9, 659.78, 399.14]\n\nFrame 3:\n  Drone pose: [50.86, -61.58, 20.0, -51.61, 176.54, 0.0]\n  Target bbox: [553.64, 256.74, 612.21, 344.17]\n\nFrame 4:\n  Drone pose: [50.43, -61.75, 20.0, -48.73, 177.65, 0.0]\n  Target bbox: [563.45, 314.38, 602.84, 381.71]\n\nFrame 5 (current):\n  Drone pose: [49.98, -61.94, 20.0, -52.3, 167.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 673.0, \"ymin\": 245.84, \"xmax\": 721.17, \"ymax\": 326.05}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 4.48, \"dyaw\": 6.0, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": 4.56, \"dyaw\": 5.34, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": 4.53, \"dyaw\": 6.34, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": -0.8, \"dz\": 0.0, \"dpitch\": 4.59, \"dyaw\": 5.73, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": -0.97, \"dz\": 0.0, \"dpitch\": 4.55, \"dyaw\": 6.82, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.85, "window_alt_abs_m": 0.0, "target_px_mean_hist": 740.8, "cur_frame_id": 52, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [47.59, -62.91, 20.0, -47.11, 177.47, 0.0]\n  Target bbox: [579.31, 336.13, 617.93, 406.12]\n\nFrame 2:\n  Drone pose: [47.1, -63.06, 20.0, -46.14, 178.36, 0.0]\n  Target bbox: [560.69, 349.98, 605.39, 425.22]\n\nFrame 3:\n  Drone pose: [46.72, -63.12, 19.95, -46.86, -178.93, 0.0]\n  Target bbox: [604.87, 283.31, 649.26, 358.79]\n\nFrame 4:\n  Drone pose: [46.13, -63.33, 20.0, -47.69, 174.13, 0.0]\n  Target bbox: [620.74, 323.47, 658.86, 395.56]\n\nFrame 5 (current):\n  Drone pose: [45.64, -63.45, 20.0, -51.57, 170.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 656.93, \"ymin\": 252.8, \"xmax\": 708.21, \"ymax\": 336.49}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": 3.86, \"dyaw\": 4.93, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": 3.86, \"dyaw\": 4.51, \"droll\": 0.0}, {\"dx\": -1.54, \"dy\": -0.38, \"dz\": 0.0, \"dpitch\": 3.76, \"dyaw\": 5.73, \"droll\": 0.0}, {\"dx\": -2.09, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 3.69, \"dyaw\": 5.28, \"droll\": 0.0}, {\"dx\": -2.66, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": 3.54, \"dyaw\": 6.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.64, "window_alt_abs_m": 0.09, "target_px_mean_hist": 716.8, "cur_frame_id": 61, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [42.41, -64.21, 20.0, -48.87, 171.79, 0.0]\n  Target bbox: [663.09, 306.56, 715.37, 391.13]\n\nFrame 2:\n  Drone pose: [41.82, -64.31, 20.0, -43.39, 174.4, 0.0]\n  Target bbox: [653.49, 403.94, 695.02, 482.5]\n\nFrame 3:\n  Drone pose: [41.24, -64.4, 20.0, -48.46, 177.14, 0.0]\n  Target bbox: [621.88, 325.65, 658.2, 393.31]\n\nFrame 4:\n  Drone pose: [40.71, -64.49, 19.89, -51.43, 173.97, 0.0]\n  Target bbox: [643.92, 334.04, 692.56, 419.41]\n\nFrame 5 (current):\n  Drone pose: [40.07, -64.53, 20.0, -48.76, 178.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.25, \"ymin\": 322.88, \"xmax\": 658.3, \"ymax\": 396.05}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -1.16, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": -0.29, \"dyaw\": 1.32, \"droll\": 0.0}, {\"dx\": -1.74, \"dy\": -0.17, \"dz\": 0.0, \"dpitch\": -0.42, \"dyaw\": 1.09, \"droll\": 0.0}, {\"dx\": -2.3, \"dy\": -0.25, \"dz\": 0.0, \"dpitch\": -0.54, \"dyaw\": 2.58, \"droll\": 0.0}, {\"dx\": -2.87, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": -0.65, \"dyaw\": 2.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.93, "window_alt_abs_m": 0.22, "target_px_mean_hist": 734.8, "cur_frame_id": 71, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [37.2, -64.86, 20.0, -51.91, -181.97, 0.0]\n  Target bbox: [643.44, 275.19, 696.45, 360.86]\n\nFrame 2:\n  Drone pose: [36.64, -64.94, 20.0, -48.06, -182.83, 0.0]\n  Target bbox: [675.5, 346.94, 713.99, 423.93]\n\nFrame 3:\n  Drone pose: [36.09, -65.02, 20.0, -49.59, -178.12, 0.0]\n  Target bbox: [614.45, 317.82, 665.64, 401.22]\n\nFrame 4:\n  Drone pose: [35.54, -65.1, 20.0, -52.01, -172.59, 0.0]\n  Target bbox: [575.62, 281.21, 616.26, 360.49]\n\nFrame 5 (current):\n  Drone pose: [35.01, -65.16, 20.0, -52.4, -176.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 610.62, \"ymin\": 270.56, \"xmax\": 666.85, \"ymax\": 358.28}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 2.68, \"dyaw\": 1.54, \"droll\": 0.0}, {\"dx\": -1.08, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 2.61, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": -1.62, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 2.53, \"dyaw\": 1.33, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 2.66, \"dyaw\": 3.14, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 2.73, \"dyaw\": 3.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 728.2, "cur_frame_id": 80, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130/aug_001/frames_playback/frame_00090/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [32.06, -65.15, 20.0, -49.36, -171.53, 0.0]\n  Target bbox: [619.91, 324.58, 659.8, 394.3]\n\nFrame 2:\n  Drone pose: [31.63, -64.95, 20.09, -46.22, -171.86, 0.0]\n  Target bbox: [652.93, 306.13, 693.31, 378.28]\n\nFrame 3:\n  Drone pose: [31.27, -65.0, 20.0, -46.72, -172.4, 0.0]\n  Target bbox: [651.62, 355.27, 695.46, 435.47]\n\nFrame 4:\n  Drone pose: [30.87, -64.93, 20.0, -51.12, -164.26, 0.0]\n  Target bbox: [561.27, 281.54, 606.82, 357.16]\n\nFrame 5 (current):\n  Drone pose: [30.39, -64.74, 20.1, -50.4, -167.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.38, \"ymin\": 323.53, \"xmax\": 660.2, \"ymax\": 395.29}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": -0.05, \"dz\": -0.1, \"dpitch\": 2.34, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": 0.03, \"dz\": -0.1, \"dpitch\": 2.73, \"dyaw\": 2.23, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 0.1, \"dz\": -0.1, \"dpitch\": 2.93, \"dyaw\": 2.37, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.17, \"dz\": -0.1, \"dpitch\": 3.12, \"dyaw\": 2.52, \"droll\": 0.0}, {\"dx\": -1.9, \"dy\": 0.24, \"dz\": -0.1, \"dpitch\": 3.53, \"dyaw\": 4.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.48, "window_alt_abs_m": 0.27, "target_px_mean_hist": 722.5, "cur_frame_id": 90, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776215130", "difficulty_score": 0.2149, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [48.89, 25.94, 22.0, -46.42, 180.0, 0.0]\n  Target bbox: [627.26, 330.93, 652.74, 388.23]\n\nFrame 2:\n  Drone pose: [48.5, 26.15, 21.2, -45.15, -179.41, 0.0]\n  Target bbox: [627.65, 328.38, 652.6, 390.82]\n\nFrame 3:\n  Drone pose: [48.1, 26.36, 20.67, -44.24, -178.83, 0.0]\n  Target bbox: [626.27, 324.27, 654.11, 395.06]\n\nFrame 4:\n  Drone pose: [47.71, 26.57, 20.64, -44.04, -178.25, 0.0]\n  Target bbox: [626.64, 326.9, 653.65, 392.36]\n\nFrame 5 (current):\n  Drone pose: [47.32, 26.77, 20.62, -43.84, -177.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.7, \"ymin\": 324.87, \"xmax\": 653.64, \"ymax\": 394.46}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.21, \"dz\": -0.03, \"dpitch\": 0.19, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.42, \"dz\": -0.05, \"dpitch\": 0.39, \"dyaw\": 1.13, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": 0.62, \"dz\": -0.07, \"dpitch\": 0.59, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": 0.83, \"dz\": -0.09, \"dpitch\": 0.78, \"dyaw\": 2.22, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 1.04, \"dz\": -0.2, \"dpitch\": 1.11, \"dyaw\": 2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.32, "window_alt_abs_m": 1.38, "target_px_mean_hist": 552.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [41.8, 29.67, 20.17, -40.8, -170.35, 0.0]\n  Target bbox: [625.8, 325.54, 654.5, 393.95]\n\nFrame 2:\n  Drone pose: [41.41, 29.88, 20.15, -40.6, -169.87, 0.0]\n  Target bbox: [625.46, 328.49, 654.72, 390.83]\n\nFrame 3:\n  Drone pose: [41.01, 30.09, 20.14, -40.39, -169.4, 0.0]\n  Target bbox: [625.4, 323.94, 654.96, 395.67]\n\nFrame 4:\n  Drone pose: [40.62, 30.29, 20.12, -40.19, -168.93, 0.0]\n  Target bbox: [624.23, 324.47, 656.14, 395.17]\n\nFrame 5 (current):\n  Drone pose: [40.23, 30.5, 20.11, -39.99, -168.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.91, \"ymin\": 324.02, \"xmax\": 656.5, \"ymax\": 395.73}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.21, \"dz\": -0.02, \"dpitch\": 0.2, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.41, \"dz\": -0.03, \"dpitch\": 0.4, \"dyaw\": 0.9, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": 0.62, \"dz\": -0.04, \"dpitch\": 0.59, \"dyaw\": 1.34, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": 0.83, \"dz\": -0.05, \"dpitch\": 0.79, \"dyaw\": 1.78, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 1.03, \"dz\": -0.06, \"dpitch\": 0.98, \"dyaw\": 2.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.87, "window_alt_abs_m": 0.07, "target_px_mean_hist": 535.0, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [34.07, 32.16, 20.01, -38.41, -165.0, 0.0]\n  Target bbox: [625.75, 329.42, 654.41, 390.1]\n\nFrame 2:\n  Drone pose: [33.57, 32.16, 20.01, -38.4, -165.0, 0.0]\n  Target bbox: [624.97, 325.67, 655.37, 394.1]\n\nFrame 3:\n  Drone pose: [33.07, 32.16, 20.01, -38.4, -165.0, 0.0]\n  Target bbox: [624.19, 324.79, 656.2, 395.08]\n\nFrame 4:\n  Drone pose: [32.57, 32.16, 20.01, -38.4, -165.0, 0.0]\n  Target bbox: [624.24, 326.41, 656.04, 393.25]\n\nFrame 5 (current):\n  Drone pose: [32.07, 32.16, 20.0, -38.4, -165.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.8, \"ymin\": 324.85, \"xmax\": 655.55, \"ymax\": 394.92}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 491.0, "cur_frame_id": 40, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.07, 32.16, 20.0, -38.39, -165.0, 0.0]\n  Target bbox: [625.73, 329.37, 654.43, 390.16]\n\nFrame 2:\n  Drone pose: [23.31, 32.0, 20.0, -39.91, -164.56, 0.0]\n  Target bbox: [625.1, 328.49, 655.04, 390.88]\n\nFrame 3:\n  Drone pose: [21.53, 31.88, 20.0, -41.52, -163.94, 0.0]\n  Target bbox: [622.19, 322.3, 658.25, 397.45]\n\nFrame 4:\n  Drone pose: [19.75, 31.82, 20.0, -43.23, -163.13, 0.0]\n  Target bbox: [622.77, 321.32, 657.62, 398.14]\n\nFrame 5 (current):\n  Drone pose: [17.98, 31.8, 20.0, -45.01, -162.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.43, \"ymin\": 325.82, \"xmax\": 656.76, \"ymax\": 393.35}, \"waypoint_deltas\": [{\"dx\": -0.48, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 0.07, \"droll\": 0.0}, {\"dx\": -0.95, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 0.22, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 0.13, \"dz\": 0.0, \"dpitch\": 0.2, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": -2.38, \"dy\": 0.16, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 0.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.93, "window_alt_abs_m": 0.0, "target_px_mean_hist": 535.0, "cur_frame_id": 58, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [11.31, 32.26, 20.0, -44.32, -161.06, 0.0]\n  Target bbox: [620.58, 320.31, 659.86, 399.21]\n\nFrame 2:\n  Drone pose: [10.83, 32.3, 20.0, -44.27, -160.99, 0.0]\n  Target bbox: [623.0, 325.55, 657.16, 393.58]\n\nFrame 3:\n  Drone pose: [10.35, 32.33, 20.0, -44.23, -160.92, 0.0]\n  Target bbox: [623.18, 325.66, 657.01, 393.57]\n\nFrame 4:\n  Drone pose: [9.86, 32.35, 20.0, -44.2, -160.89, 0.0]\n  Target bbox: [622.59, 324.09, 657.67, 395.21]\n\nFrame 5 (current):\n  Drone pose: [9.38, 32.36, 20.0, -44.18, -160.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.6, \"ymin\": 321.29, \"xmax\": 658.79, \"ymax\": 398.16}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -1.96, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 584.0, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [2.04, 32.6, 20.0, -43.84, -160.37, 0.0]\n  Target bbox: [623.43, 326.19, 656.7, 392.96]\n\nFrame 2:\n  Drone pose: [1.55, 32.61, 20.0, -43.82, -160.34, 0.0]\n  Target bbox: [621.91, 322.49, 658.41, 396.89]\n\nFrame 3:\n  Drone pose: [1.06, 32.63, 20.0, -43.8, -160.3, 0.0]\n  Target bbox: [623.37, 326.22, 656.76, 392.93]\n\nFrame 4:\n  Drone pose: [0.57, 32.64, 20.0, -43.78, -160.27, 0.0]\n  Target bbox: [621.72, 321.84, 658.63, 397.57]\n\nFrame 5 (current):\n  Drone pose: [0.08, 32.66, 20.0, -43.75, -160.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.99, \"ymin\": 323.8, \"xmax\": 658.25, \"ymax\": 395.48}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.46, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -2.44, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.13, "window_alt_abs_m": 0.0, "target_px_mean_hist": 581.8, "cur_frame_id": 95, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-6.82, 32.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [621.71, 320.94, 658.67, 398.53]\n\nFrame 2:\n  Drone pose: [-7.32, 32.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [622.97, 325.07, 657.25, 394.25]\n\nFrame 3:\n  Drone pose: [-7.82, 32.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [621.61, 321.15, 658.76, 398.3]\n\nFrame 4:\n  Drone pose: [-8.32, 32.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [620.27, 320.59, 660.17, 399.01]\n\nFrame 5 (current):\n  Drone pose: [-8.82, 32.8, 20.0, -43.55, -159.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.98, \"ymin\": 322.3, \"xmax\": 658.37, \"ymax\": 397.16}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 576.0, "cur_frame_id": 113, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00131/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-15.82, 29.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [621.81, 321.26, 658.56, 398.18]\n\nFrame 2:\n  Drone pose: [-16.32, 29.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [622.02, 322.49, 658.3, 396.92]\n\nFrame 3:\n  Drone pose: [-16.82, 29.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [622.63, 323.55, 657.61, 395.73]\n\nFrame 4:\n  Drone pose: [-17.32, 29.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [623.48, 326.27, 656.68, 392.98]\n\nFrame 5 (current):\n  Drone pose: [-17.82, 29.8, 20.0, -43.55, -159.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.59, \"ymin\": 320.8, \"xmax\": 659.83, \"ymax\": 398.76}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 381.0, "cur_frame_id": 131, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00147/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00148/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00149/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.74, 29.29, 20.0, -46.53, -159.37, 0.0]\n  Target bbox: [621.72, 322.1, 658.54, 396.98]\n\nFrame 2:\n  Drone pose: [-28.22, 29.09, 20.0, -48.15, -158.83, 0.0]\n  Target bbox: [621.7, 323.27, 658.48, 395.61]\n\nFrame 3:\n  Drone pose: [-29.72, 28.93, 20.0, -49.84, -158.1, 0.0]\n  Target bbox: [618.33, 318.18, 662.04, 400.76]\n\nFrame 4:\n  Drone pose: [-31.22, 28.81, 20.0, -51.59, -157.1, 0.0]\n  Target bbox: [618.1, 319.11, 662.18, 399.62]\n\nFrame 5 (current):\n  Drone pose: [-32.72, 28.72, 20.0, -53.39, -155.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.56, \"ymin\": 315.81, \"xmax\": 662.85, \"ymax\": 402.81}, \"waypoint_deltas\": [{\"dx\": -1.49, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -1.91, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": -3.0, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": -3.97, \"dyaw\": 2.51, \"droll\": 0.0}, {\"dx\": -3.46, \"dy\": -0.37, \"dz\": 0.0, \"dpitch\": -3.94, \"dyaw\": 2.18, \"droll\": 0.0}, {\"dx\": -3.92, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": -3.95, \"dyaw\": 1.77, \"droll\": 0.0}, {\"dx\": -4.41, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -4.03, \"dyaw\": 1.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.49, "window_alt_abs_m": 0.0, "target_px_mean_hist": 676.2, "cur_frame_id": 149, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00165/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00166/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/ORI/frames_playback/frame_00167/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-41.39, 26.24, 20.0, -58.41, -163.65, 0.0]\n  Target bbox: [619.38, 322.91, 660.81, 395.21]\n\nFrame 2:\n  Drone pose: [-41.66, 26.13, 20.0, -57.99, -164.44, 0.0]\n  Target bbox: [618.68, 316.75, 661.74, 401.51]\n\nFrame 3:\n  Drone pose: [-41.93, 26.03, 20.0, -57.57, -165.2, 0.0]\n  Target bbox: [618.84, 315.62, 661.61, 402.57]\n\nFrame 4:\n  Drone pose: [-42.2, 25.93, 20.0, -57.15, -165.94, 0.0]\n  Target bbox: [620.93, 323.68, 659.29, 394.63]\n\nFrame 5 (current):\n  Drone pose: [-42.46, 25.82, 20.0, -56.72, -166.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.88, \"ymin\": 323.71, \"xmax\": 659.35, \"ymax\": 394.64}, \"waypoint_deltas\": [{\"dx\": -0.27, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": 1.26, \"dyaw\": -2.02, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": -0.3, \"dz\": 0.0, \"dpitch\": 1.88, \"dyaw\": -2.25, \"droll\": 0.0}, {\"dx\": -1.2, \"dy\": -0.29, \"dz\": 0.0, \"dpitch\": 2.49, \"dyaw\": -2.47, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 750.8, "cur_frame_id": 167, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [48.89, 25.94, 22.0, -46.42, 180.0, 0.0]\n  Target bbox: [627.88, 325.21, 652.12, 394.05]\n\nFrame 2:\n  Drone pose: [48.5, 26.15, 21.2, -40.79, -184.41, 0.0]\n  Target bbox: [685.73, 398.85, 713.09, 470.93]\n\nFrame 3:\n  Drone pose: [47.98, 26.27, 20.73, -41.1, -183.3, 0.0]\n  Target bbox: [676.01, 385.97, 705.71, 450.63]\n\nFrame 4:\n  Drone pose: [47.64, 26.54, 20.68, -40.7, -174.77, 0.0]\n  Target bbox: [583.06, 388.44, 611.12, 450.1]\n\nFrame 5 (current):\n  Drone pose: [47.38, 26.64, 20.53, -40.55, -177.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.03, \"ymin\": 377.89, \"xmax\": 644.36, \"ymax\": 445.07}, \"waypoint_deltas\": [{\"dx\": -0.46, \"dy\": 0.34, \"dz\": 0.06, \"dpitch\": -3.1, \"dyaw\": 0.23, \"droll\": 0.0}, {\"dx\": -0.85, \"dy\": 0.55, \"dz\": 0.04, \"dpitch\": -2.9, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 0.75, \"dz\": 0.02, \"dpitch\": -2.7, \"dyaw\": 1.34, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 0.96, \"dz\": 0.0, \"dpitch\": -2.51, \"dyaw\": 1.88, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": 1.17, \"dz\": -0.11, \"dpitch\": -2.18, \"dyaw\": 2.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.61, "window_alt_abs_m": 1.47, "target_px_mean_hist": 563.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [41.8, 29.67, 20.17, -44.91, -169.46, 0.0]\n  Target bbox: [613.43, 254.9, 643.79, 326.87]\n\nFrame 2:\n  Drone pose: [41.35, 29.98, 20.13, -44.38, -171.62, 0.0]\n  Target bbox: [650.77, 263.48, 681.98, 330.13]\n\nFrame 3:\n  Drone pose: [41.05, 30.12, 20.08, -40.25, -169.33, 0.0]\n  Target bbox: [624.89, 323.82, 655.52, 395.93]\n\nFrame 4:\n  Drone pose: [40.62, 30.29, 20.12, -40.15, -163.93, 0.0]\n  Target bbox: [558.32, 327.64, 593.16, 397.0]\n\nFrame 5 (current):\n  Drone pose: [40.23, 30.5, 20.11, -41.62, -164.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.06, \"ymin\": 298.27, \"xmax\": 606.18, \"ymax\": 368.8}, \"waypoint_deltas\": [{\"dx\": -0.4, \"dy\": 0.21, \"dz\": -0.02, \"dpitch\": 1.83, \"dyaw\": -3.49, \"droll\": 0.0}, {\"dx\": -0.79, \"dy\": 0.41, \"dz\": -0.03, \"dpitch\": 2.03, \"dyaw\": -3.04, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": 0.62, \"dz\": -0.04, \"dpitch\": 2.22, \"dyaw\": -2.6, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": 0.83, \"dz\": -0.05, \"dpitch\": 2.42, \"dyaw\": -2.16, \"droll\": 0.0}, {\"dx\": -1.97, \"dy\": 1.03, \"dz\": -0.06, \"dpitch\": 2.61, \"dyaw\": -1.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.45, "window_alt_abs_m": 0.15, "target_px_mean_hist": 540.8, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00040/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [34.07, 32.16, 20.01, -38.41, -165.0, 0.0]\n  Target bbox: [625.06, 325.31, 655.27, 394.45]\n\nFrame 2:\n  Drone pose: [33.57, 32.16, 20.01, -41.87, -162.6, 0.0]\n  Target bbox: [591.74, 266.7, 624.81, 337.39]\n\nFrame 3:\n  Drone pose: [33.07, 32.16, 20.01, -40.53, -170.0, 0.0]\n  Target bbox: [690.56, 292.4, 721.91, 359.2]\n\nFrame 4:\n  Drone pose: [32.6, 32.25, 20.13, -36.8, -166.15, 0.0]\n  Target bbox: [643.09, 354.61, 673.17, 423.38]\n\nFrame 5 (current):\n  Drone pose: [32.02, 32.05, 20.01, -40.52, -161.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 576.03, \"ymin\": 291.22, \"xmax\": 611.2, \"ymax\": 362.48}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 2.13, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": -0.95, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 2.13, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 2.13, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 2.13, \"dyaw\": -3.31, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.11, \"dz\": -0.01, \"dpitch\": 2.13, \"dyaw\": -3.31, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.11, "window_alt_abs_m": 0.24, "target_px_mean_hist": 505.5, "cur_frame_id": 40, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00058/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.07, 32.16, 20.0, -39.48, -160.71, 0.0]\n  Target bbox: [567.38, 309.63, 599.48, 376.14]\n\nFrame 2:\n  Drone pose: [23.42, 31.95, 19.91, -37.78, -166.78, 0.0]\n  Target bbox: [651.35, 357.63, 681.92, 425.23]\n\nFrame 3:\n  Drone pose: [21.44, 31.77, 20.13, -41.87, -164.18, 0.0]\n  Target bbox: [623.83, 323.17, 656.53, 396.39]\n\nFrame 4:\n  Drone pose: [19.7, 31.79, 19.81, -43.02, -163.15, 0.0]\n  Target bbox: [623.93, 326.49, 656.26, 392.78]\n\nFrame 5 (current):\n  Drone pose: [17.97, 31.63, 20.05, -45.16, -162.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.73, \"ymin\": 320.04, \"xmax\": 659.72, \"ymax\": 399.4}, \"waypoint_deltas\": [{\"dx\": -0.47, \"dy\": 0.2, \"dz\": -0.05, \"dpitch\": 0.2, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": -0.94, \"dy\": 0.23, \"dz\": -0.05, \"dpitch\": 0.25, \"dyaw\": 0.63, \"droll\": 0.0}, {\"dx\": -1.42, \"dy\": 0.27, \"dz\": -0.05, \"dpitch\": 0.3, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -1.9, \"dy\": 0.3, \"dz\": -0.05, \"dpitch\": 0.35, \"dyaw\": 0.77, \"droll\": 0.0}, {\"dx\": -2.37, \"dy\": 0.33, \"dz\": -0.05, \"dpitch\": 0.4, \"dyaw\": 0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.29, "window_alt_abs_m": 0.86, "target_px_mean_hist": 538.5, "cur_frame_id": 58, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [11.31, 32.26, 20.0, -44.32, -161.06, 0.0]\n  Target bbox: [623.07, 325.24, 657.13, 394.01]\n\nFrame 2:\n  Drone pose: [10.76, 32.15, 19.89, -41.46, -164.09, 0.0]\n  Target bbox: [654.18, 367.95, 693.54, 447.21]\n\nFrame 3:\n  Drone pose: [10.24, 32.48, 20.03, -44.35, -160.4, 0.0]\n  Target bbox: [623.31, 326.0, 656.86, 393.21]\n\nFrame 4:\n  Drone pose: [9.98, 32.22, 20.05, -44.17, -161.36, 0.0]\n  Target bbox: [620.72, 320.43, 659.73, 399.14]\n\nFrame 5 (current):\n  Drone pose: [9.38, 32.36, 20.0, -44.18, -160.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.59, \"ymin\": 320.47, \"xmax\": 659.84, \"ymax\": 399.03}, \"waypoint_deltas\": [{\"dx\": -0.49, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": -0.98, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": -1.47, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -1.96, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.13, \"droll\": 0.0}, {\"dx\": -2.45, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.18, "window_alt_abs_m": 0.31, "target_px_mean_hist": 583.8, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [2.04, 32.6, 20.0, -41.56, -159.04, 0.0]\n  Target bbox: [605.57, 359.72, 641.96, 436.57]\n\nFrame 2:\n  Drone pose: [1.55, 32.61, 20.0, -43.82, -160.34, 0.0]\n  Target bbox: [620.32, 320.42, 660.13, 399.19]\n\nFrame 3:\n  Drone pose: [1.18, 32.76, 20.03, -46.75, -158.2, 0.0]\n  Target bbox: [597.71, 268.18, 636.61, 346.43]\n\nFrame 4:\n  Drone pose: [0.43, 32.51, 19.96, -45.11, -161.88, 0.0]\n  Target bbox: [639.33, 303.69, 675.29, 377.64]\n\nFrame 5 (current):\n  Drone pose: [0.15, 32.53, 19.89, -39.28, -156.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 575.43, \"ymin\": 395.19, \"xmax\": 613.63, \"ymax\": 470.45}, \"waypoint_deltas\": [{\"dx\": -0.56, \"dy\": 0.14, \"dz\": 0.11, \"dpitch\": -4.45, \"dyaw\": -3.26, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 0.16, \"dz\": 0.11, \"dpitch\": -4.43, \"dyaw\": -3.23, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 0.18, \"dz\": 0.11, \"dpitch\": -4.41, \"dyaw\": -3.2, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": 0.19, \"dz\": 0.11, \"dpitch\": -4.38, \"dyaw\": -3.16, \"droll\": 0.0}, {\"dx\": -2.51, \"dy\": 0.21, \"dz\": 0.11, \"dpitch\": -4.36, \"dyaw\": -3.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.05, "window_alt_abs_m": 0.16, "target_px_mean_hist": 584.0, "cur_frame_id": 95, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00111/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00113/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-6.82, 32.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [621.67, 321.69, 658.68, 397.75]\n\nFrame 2:\n  Drone pose: [-7.32, 32.8, 20.0, -44.81, -164.96, 0.0]\n  Target bbox: [683.56, 306.66, 719.04, 374.27]\n\nFrame 3:\n  Drone pose: [-7.88, 32.81, 20.04, -45.91, -157.34, 0.0]\n  Target bbox: [591.33, 288.24, 626.89, 357.37]\n\nFrame 4:\n  Drone pose: [-8.32, 32.8, 20.0, -38.55, -158.54, 0.0]\n  Target bbox: [603.84, 406.98, 641.14, 480.89]\n\nFrame 5 (current):\n  Drone pose: [-8.82, 32.8, 20.0, -43.55, -159.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.08, \"ymin\": 324.6, \"xmax\": 657.12, \"ymax\": 394.64}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -2.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.23, "window_alt_abs_m": 0.09, "target_px_mean_hist": 581.0, "cur_frame_id": 113, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00130/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00131/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-15.81, 29.88, 20.08, -44.69, -160.61, 0.0]\n  Target bbox: [632.79, 303.29, 669.37, 380.28]\n\nFrame 2:\n  Drone pose: [-16.41, 29.79, 19.82, -42.94, -155.11, 0.0]\n  Target bbox: [561.98, 331.99, 600.93, 406.56]\n\nFrame 3:\n  Drone pose: [-16.82, 29.8, 20.0, -44.11, -157.58, 0.0]\n  Target bbox: [591.8, 312.41, 629.88, 389.17]\n\nFrame 4:\n  Drone pose: [-17.32, 29.8, 20.0, -43.55, -159.96, 0.0]\n  Target bbox: [622.16, 322.1, 658.19, 397.37]\n\nFrame 5 (current):\n  Drone pose: [-17.98, 29.87, 19.99, -43.71, -159.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.86, \"ymin\": 324.95, \"xmax\": 657.35, \"ymax\": 394.36}, \"waypoint_deltas\": [{\"dx\": -0.34, \"dy\": -0.07, \"dz\": 0.01, \"dpitch\": 0.16, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -0.84, \"dy\": -0.07, \"dz\": 0.01, \"dpitch\": 0.16, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": -0.07, \"dz\": 0.01, \"dpitch\": 0.16, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -1.84, \"dy\": -0.07, \"dz\": 0.01, \"dpitch\": 0.16, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": -2.34, \"dy\": -0.07, \"dz\": 0.01, \"dpitch\": 0.16, \"dyaw\": -0.35, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.7, "window_alt_abs_m": 0.45, "target_px_mean_hist": 356.5, "cur_frame_id": 131, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00146/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00147/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00148/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00149/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-26.74, 29.29, 20.0, -48.91, -161.48, 0.0]\n  Target bbox: [645.74, 279.61, 684.51, 360.51]\n\nFrame 2:\n  Drone pose: [-28.3, 29.08, 19.99, -48.54, -159.02, 0.0]\n  Target bbox: [621.36, 313.0, 664.66, 396.6]\n\nFrame 3:\n  Drone pose: [-29.74, 29.03, 20.02, -49.85, -157.73, 0.0]\n  Target bbox: [618.73, 317.53, 661.71, 401.53]\n\nFrame 4:\n  Drone pose: [-31.26, 28.95, 19.99, -52.12, -161.55, 0.0]\n  Target bbox: [670.88, 309.57, 715.13, 393.3]\n\nFrame 5 (current):\n  Drone pose: [-32.79, 28.77, 19.92, -52.0, -150.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 567.75, \"ymin\": 346.14, \"xmax\": 611.07, \"ymax\": 422.17}, \"waypoint_deltas\": [{\"dx\": -1.42, \"dy\": -0.17, \"dz\": 0.08, \"dpitch\": -3.3, \"dyaw\": -4.02, \"droll\": 0.0}, {\"dx\": -2.93, \"dy\": -0.37, \"dz\": 0.08, \"dpitch\": -5.36, \"dyaw\": -2.81, \"droll\": 0.0}, {\"dx\": -3.39, \"dy\": -0.42, \"dz\": 0.08, \"dpitch\": -5.33, \"dyaw\": -3.14, \"droll\": 0.0}, {\"dx\": -3.85, \"dy\": -0.5, \"dz\": 0.08, \"dpitch\": -5.34, \"dyaw\": -3.55, \"droll\": 0.0}, {\"dx\": -4.34, \"dy\": -0.61, \"dz\": 0.08, \"dpitch\": -5.42, \"dyaw\": -4.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.56, "window_alt_abs_m": 0.15, "target_px_mean_hist": 678.2, "cur_frame_id": 149, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00165/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00166/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751/aug_001/frames_playback/frame_00167/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-41.39, 26.24, 20.0, -58.41, -163.65, 0.0]\n  Target bbox: [618.34, 320.25, 661.94, 397.87]\n\nFrame 2:\n  Drone pose: [-41.66, 26.13, 20.0, -57.99, -164.44, 0.0]\n  Target bbox: [616.83, 314.86, 663.67, 403.39]\n\nFrame 3:\n  Drone pose: [-41.93, 26.03, 20.0, -57.57, -165.2, 0.0]\n  Target bbox: [620.48, 323.49, 659.74, 394.79]\n\nFrame 4:\n  Drone pose: [-42.05, 25.86, 20.04, -53.7, -161.39, 0.0]\n  Target bbox: [571.23, 371.21, 616.16, 459.08]\n\nFrame 5 (current):\n  Drone pose: [-42.32, 25.92, 19.92, -55.91, -161.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.12, \"ymin\": 327.68, \"xmax\": 614.65, \"ymax\": 405.71}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": -0.2, \"dz\": 0.08, \"dpitch\": -0.39, \"dyaw\": -5.97, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": -0.3, \"dz\": 0.08, \"dpitch\": 0.03, \"dyaw\": -6.65, \"droll\": 0.0}, {\"dx\": -0.95, \"dy\": -0.41, \"dz\": 0.08, \"dpitch\": 0.45, \"dyaw\": -7.3, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": -0.4, \"dz\": 0.08, \"dpitch\": 1.07, \"dyaw\": -7.53, \"droll\": 0.0}, {\"dx\": -1.34, \"dy\": -0.39, \"dz\": 0.08, \"dpitch\": 1.68, \"dyaw\": -7.75, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.38, "window_alt_abs_m": 0.17, "target_px_mean_hist": 728.8, "cur_frame_id": 167, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-18/trajectory_1776498751", "difficulty_score": 0.2327, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.75, 25.97, 22.0, -46.48, -61.19, 0.0]\n  Target bbox: [627.08, 329.22, 653.0, 389.94]\n\nFrame 2:\n  Drone pose: [-112.87, 23.59, 21.2, -46.85, -55.49, 0.0]\n  Target bbox: [625.95, 327.76, 654.11, 391.35]\n\nFrame 3:\n  Drone pose: [-113.14, 22.47, 20.67, -46.64, -53.71, 0.0]\n  Target bbox: [627.38, 328.55, 652.46, 390.46]\n\nFrame 4:\n  Drone pose: [-113.17, 21.58, 20.64, -46.58, -51.69, 0.0]\n  Target bbox: [630.29, 328.53, 649.56, 390.52]\n\nFrame 5 (current):\n  Drone pose: [-113.22, 20.68, 20.62, -46.48, -49.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.79, \"ymin\": 327.8, \"xmax\": 654.98, \"ymax\": 391.2}, \"waypoint_deltas\": [{\"dx\": 0.18, \"dy\": -0.66, \"dz\": -0.03, \"dpitch\": 0.17, \"dyaw\": 1.07, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": -1.28, \"dz\": -0.05, \"dpitch\": 0.36, \"dyaw\": 1.95, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -1.7, \"dz\": -0.07, \"dpitch\": 0.51, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": 1.39, \"dy\": -2.13, \"dz\": -0.09, \"dpitch\": 0.6, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": -2.58, \"dz\": -0.2, \"dpitch\": 0.78, \"dyaw\": 1.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.57, "window_alt_abs_m": 1.38, "target_px_mean_hist": 463.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.83, 18.55, 20.53, -45.88, -47.94, 0.0]\n  Target bbox: [627.16, 328.23, 652.63, 390.82]\n\nFrame 2:\n  Drone pose: [-111.29, 18.1, 20.42, -45.7, -48.14, 0.0]\n  Target bbox: [628.34, 329.02, 651.49, 390.0]\n\nFrame 3:\n  Drone pose: [-110.75, 17.65, 20.39, -45.64, -48.32, 0.0]\n  Target bbox: [630.58, 328.6, 649.27, 390.48]\n\nFrame 4:\n  Drone pose: [-110.23, 17.2, 20.36, -45.34, -48.47, 0.0]\n  Target bbox: [625.78, 329.46, 654.04, 389.53]\n\nFrame 5 (current):\n  Drone pose: [-109.75, 16.75, 20.33, -45.22, -48.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.77, \"ymin\": 327.34, \"xmax\": 650.04, \"ymax\": 391.75}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.44, \"dz\": -0.03, \"dpitch\": 0.17, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": -0.87, \"dz\": -0.06, \"dpitch\": -0.08, \"dyaw\": -0.97, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -1.3, \"dz\": -0.09, \"dpitch\": -0.29, \"dyaw\": -1.9, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": -1.72, \"dz\": -0.11, \"dpitch\": -0.46, \"dyaw\": -2.78, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": -2.15, \"dz\": -0.14, \"dpitch\": -0.61, \"dyaw\": -3.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.59, "window_alt_abs_m": 0.2, "target_px_mean_hist": 504.8, "cur_frame_id": 12, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.99, 14.6, 20.19, -45.83, -52.16, 0.0]\n  Target bbox: [623.02, 324.44, 657.01, 394.63]\n\nFrame 2:\n  Drone pose: [-107.71, 14.17, 20.17, -45.97, -52.98, 0.0]\n  Target bbox: [626.53, 326.95, 653.54, 392.09]\n\nFrame 3:\n  Drone pose: [-107.44, 13.77, 20.15, -46.07, -53.83, 0.0]\n  Target bbox: [623.43, 324.75, 656.62, 394.3]\n\nFrame 4:\n  Drone pose: [-107.18, 13.37, 20.14, -46.16, -54.68, 0.0]\n  Target bbox: [624.66, 325.74, 655.4, 393.31]\n\nFrame 5 (current):\n  Drone pose: [-106.93, 12.91, 20.12, -46.31, -55.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.66, \"ymin\": 326.51, \"xmax\": 653.42, \"ymax\": 392.47}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.44, \"dz\": -0.02, \"dpitch\": -0.12, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -0.87, \"dz\": -0.03, \"dpitch\": -0.23, \"dyaw\": -1.53, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": -1.32, \"dz\": -0.04, \"dpitch\": -0.34, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -1.77, \"dz\": -0.05, \"dpitch\": -0.44, \"dyaw\": -2.92, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": -2.22, \"dz\": -0.06, \"dpitch\": -0.54, \"dyaw\": -3.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": true, "window_yaw_abs_deg": 3.26, "window_alt_abs_m": 0.07, "target_px_mean_hist": 397.2, "cur_frame_id": 21, "source": "ORI", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00030/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.77, 10.69, 20.06, -46.85, -59.0, 0.0]\n  Target bbox: [625.88, 326.12, 654.21, 392.86]\n\nFrame 2:\n  Drone pose: [-105.55, 10.23, 20.05, -46.95, -59.66, 0.0]\n  Target bbox: [623.14, 324.68, 656.94, 394.31]\n\nFrame 3:\n  Drone pose: [-105.34, 9.78, 20.04, -47.04, -60.35, 0.0]\n  Target bbox: [624.93, 325.16, 655.17, 393.78]\n\nFrame 4:\n  Drone pose: [-105.12, 9.34, 20.04, -47.12, -61.06, 0.0]\n  Target bbox: [626.82, 326.51, 653.28, 392.44]\n\nFrame 5 (current):\n  Drone pose: [-104.92, 8.9, 20.03, -47.19, -61.74, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.32, \"ymin\": 324.73, \"xmax\": 656.77, \"ymax\": 394.22}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.45, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -0.9, \"dz\": -0.01, \"dpitch\": -0.14, \"dyaw\": -1.26, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.36, \"dz\": -0.01, \"dpitch\": -0.19, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": -1.81, \"dz\": -0.01, \"dpitch\": -0.24, \"dyaw\": -2.39, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -2.27, \"dz\": -0.02, \"dpitch\": -0.28, \"dyaw\": -2.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.74, "window_alt_abs_m": 0.03, "target_px_mean_hist": 503.2, "cur_frame_id": 30, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.04, 6.63, 20.01, -47.47, -64.65, 0.0]\n  Target bbox: [626.2, 326.12, 653.91, 392.79]\n\nFrame 2:\n  Drone pose: [-103.9, 6.17, 20.01, -47.5, -65.12, 0.0]\n  Target bbox: [624.98, 325.19, 655.13, 393.69]\n\nFrame 3:\n  Drone pose: [-103.78, 5.72, 20.01, -47.52, -65.56, 0.0]\n  Target bbox: [622.76, 324.15, 657.36, 394.75]\n\nFrame 4:\n  Drone pose: [-103.65, 5.26, 20.01, -47.53, -65.98, 0.0]\n  Target bbox: [625.8, 325.97, 654.32, 392.95]\n\nFrame 5 (current):\n  Drone pose: [-103.53, 4.79, 20.01, -47.56, -66.4, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.49, \"ymin\": 325.08, \"xmax\": 655.63, \"ymax\": 393.81}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": -0.46, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": -0.38, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -0.93, \"dz\": -0.01, \"dpitch\": -0.03, \"dyaw\": -0.7, \"droll\": 0.0}, {\"dx\": 0.28, \"dy\": -1.39, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": -1.01, \"droll\": 0.0}, {\"dx\": 0.37, \"dy\": -1.85, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": -1.33, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": -2.35, \"dz\": -0.01, \"dpitch\": -0.06, \"dyaw\": -1.6, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.76, "window_alt_abs_m": 0.01, "target_px_mean_hist": 529.2, "cur_frame_id": 39, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.07, 2.44, 20.0, -47.62, -68.0, 0.0]\n  Target bbox: [625.73, 325.75, 654.4, 393.14]\n\nFrame 2:\n  Drone pose: [-103.01, 1.93, 20.0, -47.68, -68.18, 0.0]\n  Target bbox: [623.58, 324.86, 656.56, 394.04]\n\nFrame 3:\n  Drone pose: [-102.95, 1.47, 20.0, -47.66, -68.41, 0.0]\n  Target bbox: [624.96, 325.04, 655.18, 393.84]\n\nFrame 4:\n  Drone pose: [-102.9, 0.99, 20.0, -47.65, -68.58, 0.0]\n  Target bbox: [627.44, 326.58, 652.68, 392.32]\n\nFrame 5 (current):\n  Drone pose: [-102.86, 0.49, 20.0, -47.68, -68.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.9, \"ymin\": 325.55, \"xmax\": 654.23, \"ymax\": 393.3}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": -0.47, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.95, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.43, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -1.91, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -2.4, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -0.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 415.0, "cur_frame_id": 48, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.76, -1.91, 20.0, -47.59, -69.13, 0.0]\n  Target bbox: [626.01, 326.04, 654.12, 392.86]\n\nFrame 2:\n  Drone pose: [-102.76, -2.41, 20.0, -47.58, -69.16, 0.0]\n  Target bbox: [624.89, 325.35, 655.25, 393.51]\n\nFrame 3:\n  Drone pose: [-102.75, -2.9, 20.0, -47.58, -69.19, 0.0]\n  Target bbox: [622.82, 324.22, 657.34, 394.68]\n\nFrame 4:\n  Drone pose: [-102.74, -3.4, 20.0, -47.58, -69.22, 0.0]\n  Target bbox: [626.54, 326.3, 653.59, 392.6]\n\nFrame 5 (current):\n  Drone pose: [-102.72, -3.9, 20.0, -47.58, -69.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.78, \"ymin\": 325.17, \"xmax\": 656.36, \"ymax\": 393.74}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.01, \"dz\": 0.0, \"dpitch\": 0.27, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -1.52, \"dz\": 0.0, \"dpitch\": 0.26, \"dyaw\": 1.52, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": -2.03, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 1.57, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": -2.54, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 1.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.13, "window_alt_abs_m": 0.0, "target_px_mean_hist": 537.8, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.74, -6.44, 20.0, -47.34, -67.64, 0.0]\n  Target bbox: [626.67, 326.36, 653.46, 392.56]\n\nFrame 2:\n  Drone pose: [-102.74, -6.96, 20.0, -47.37, -67.59, 0.0]\n  Target bbox: [623.12, 324.82, 657.01, 394.12]\n\nFrame 3:\n  Drone pose: [-102.74, -7.49, 20.0, -47.4, -67.56, 0.0]\n  Target bbox: [627.23, 326.19, 652.9, 392.71]\n\nFrame 4:\n  Drone pose: [-102.73, -8.01, 20.0, -47.45, -67.57, 0.0]\n  Target bbox: [627.02, 325.82, 652.82, 393.13]\n\nFrame 5 (current):\n  Drone pose: [-102.71, -8.54, 20.0, -47.19, -66.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.36, \"ymin\": 324.92, \"xmax\": 656.76, \"ymax\": 394.03}, \"waypoint_deltas\": [{\"dx\": 0.03, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 1.44, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.61, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 1.46, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -2.15, \"dz\": 0.0, \"dpitch\": 0.45, \"dyaw\": 3.0, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -2.69, \"dz\": 0.0, \"dpitch\": 0.41, \"dyaw\": 3.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.57, "window_alt_abs_m": 0.0, "target_px_mean_hist": 527.5, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.69, -11.23, 20.0, -46.78, -62.96, 0.0]\n  Target bbox: [623.8, 324.98, 656.31, 393.99]\n\nFrame 2:\n  Drone pose: [-102.72, -11.78, 20.0, -46.83, -62.79, 0.0]\n  Target bbox: [624.07, 324.66, 656.05, 394.3]\n\nFrame 3:\n  Drone pose: [-102.75, -12.32, 20.0, -46.87, -62.65, 0.0]\n  Target bbox: [626.08, 326.1, 654.02, 392.85]\n\nFrame 4:\n  Drone pose: [-102.75, -12.87, 20.0, -46.94, -62.57, 0.0]\n  Target bbox: [623.58, 324.78, 656.52, 394.19]\n\nFrame 5 (current):\n  Drone pose: [-102.71, -13.41, 20.0, -47.02, -62.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.61, \"ymin\": 325.87, \"xmax\": 653.5, \"ymax\": 393.05}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 0.95, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.59, \"dz\": 0.0, \"dpitch\": 0.37, \"dyaw\": 1.98, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -2.11, \"dz\": 0.0, \"dpitch\": 0.61, \"dyaw\": 2.97, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": -2.63, \"dz\": 0.0, \"dpitch\": 0.87, \"dyaw\": 3.97, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.43, "window_alt_abs_m": 0.0, "target_px_mean_hist": 532.8, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/ORI/frames_playback/frame_00084/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.06, -16.04, 20.0, -46.15, -58.64, 0.0]\n  Target bbox: [626.49, 325.16, 653.3, 393.87]\n\nFrame 2:\n  Drone pose: [-101.92, -16.56, 20.0, -45.87, -57.65, 0.0]\n  Target bbox: [623.96, 325.11, 656.12, 393.95]\n\nFrame 3:\n  Drone pose: [-101.79, -17.08, 20.0, -46.01, -57.97, 0.0]\n  Target bbox: [625.17, 325.19, 654.61, 393.84]\n\nFrame 4:\n  Drone pose: [-101.66, -17.6, 20.0, -45.72, -56.96, 0.0]\n  Target bbox: [623.79, 325.03, 656.27, 394.03]\n\nFrame 5 (current):\n  Drone pose: [-101.55, -18.11, 20.0, -45.83, -57.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.35, \"ymin\": 325.34, \"xmax\": 654.43, \"ymax\": 393.7}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": -1.53, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.49, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": 0.3, \"dyaw\": 1.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.63, "window_alt_abs_m": 0.0, "target_px_mean_hist": 525.0, "cur_frame_id": 84, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.76, 25.92, 21.84, -46.32, -61.09, 0.0]\n  Target bbox: [627.29, 329.43, 652.79, 389.75]\n\nFrame 2:\n  Drone pose: [-112.87, 23.59, 21.2, -46.28, -54.29, 0.0]\n  Target bbox: [609.79, 336.34, 642.03, 402.32]\n\nFrame 3:\n  Drone pose: [-113.14, 22.47, 20.67, -46.64, -53.71, 0.0]\n  Target bbox: [624.9, 326.97, 654.88, 392.05]\n\nFrame 4:\n  Drone pose: [-113.26, 21.62, 20.54, -46.28, -51.57, 0.0]\n  Target bbox: [627.25, 327.56, 652.55, 391.49]\n\nFrame 5 (current):\n  Drone pose: [-113.13, 20.62, 20.56, -51.1, -50.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 638.58, \"ymin\": 251.38, \"xmax\": 663.57, \"ymax\": 315.24}, \"waypoint_deltas\": [{\"dx\": 0.09, \"dy\": -0.6, \"dz\": 0.03, \"dpitch\": 4.79, \"dyaw\": 2.09, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": -1.22, \"dz\": 0.01, \"dpitch\": 4.98, \"dyaw\": 2.97, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -1.64, \"dz\": -0.01, \"dpitch\": 5.13, \"dyaw\": 2.87, \"droll\": 0.0}, {\"dx\": 1.3, \"dy\": -2.07, \"dz\": -0.03, \"dpitch\": 5.22, \"dyaw\": 2.7, \"droll\": 0.0}, {\"dx\": 1.84, \"dy\": -2.52, \"dz\": -0.14, \"dpitch\": 5.4, \"dyaw\": 2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.45, "window_alt_abs_m": 1.33, "target_px_mean_hist": 469.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00008/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00012/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.72, 18.52, 20.36, -45.62, -43.14, 0.0]\n  Target bbox: [566.2, 332.03, 595.89, 396.02]\n\nFrame 2:\n  Drone pose: [-111.29, 18.1, 20.42, -45.7, -48.14, 0.0]\n  Target bbox: [628.37, 327.19, 651.41, 391.88]\n\nFrame 3:\n  Drone pose: [-110.81, 17.71, 20.35, -49.8, -49.56, 0.0]\n  Target bbox: [639.92, 254.76, 669.9, 318.76]\n\nFrame 4:\n  Drone pose: [-110.23, 17.2, 20.36, -48.74, -51.03, 0.0]\n  Target bbox: [658.59, 270.01, 682.29, 335.78]\n\nFrame 5 (current):\n  Drone pose: [-109.75, 16.75, 20.33, -46.96, -46.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 596.35, \"ymin\": 298.58, \"xmax\": 626.57, \"ymax\": 362.76}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.44, \"dz\": -0.03, \"dpitch\": 1.91, \"dyaw\": -2.34, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": -0.87, \"dz\": -0.06, \"dpitch\": 1.66, \"dyaw\": -3.34, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -1.3, \"dz\": -0.09, \"dpitch\": 1.45, \"dyaw\": -4.27, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": -1.72, \"dz\": -0.11, \"dpitch\": 1.28, \"dyaw\": -5.15, \"droll\": 0.0}, {\"dx\": 1.76, \"dy\": -2.15, \"dz\": -0.14, \"dpitch\": 1.13, \"dyaw\": -6.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.76, "window_alt_abs_m": 0.15, "target_px_mean_hist": 512.5, "cur_frame_id": 12, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00021/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.99, 14.6, 20.19, -45.83, -52.16, 0.0]\n  Target bbox: [625.9, 326.39, 654.17, 392.66]\n\nFrame 2:\n  Drone pose: [-107.71, 14.17, 20.17, -50.97, -50.43, 0.0]\n  Target bbox: [593.35, 241.18, 626.1, 310.63]\n\nFrame 3:\n  Drone pose: [-107.48, 13.9, 20.15, -45.88, -53.99, 0.0]\n  Target bbox: [623.17, 324.49, 656.88, 394.58]\n\nFrame 4:\n  Drone pose: [-107.18, 13.37, 20.14, -46.16, -54.68, 0.0]\n  Target bbox: [622.95, 324.31, 657.1, 394.73]\n\nFrame 5 (current):\n  Drone pose: [-106.93, 12.91, 20.12, -47.77, -50.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 563.39, \"ymin\": 301.47, \"xmax\": 599.83, \"ymax\": 372.27}, \"waypoint_deltas\": [{\"dx\": 0.26, \"dy\": -0.44, \"dz\": -0.02, \"dpitch\": 1.34, \"dyaw\": -5.77, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -0.87, \"dz\": -0.03, \"dpitch\": 1.23, \"dyaw\": -6.53, \"droll\": 0.0}, {\"dx\": 0.73, \"dy\": -1.32, \"dz\": -0.04, \"dpitch\": 1.12, \"dyaw\": -7.24, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -1.77, \"dz\": -0.05, \"dpitch\": 1.02, \"dyaw\": -7.92, \"droll\": 0.0}, {\"dx\": 1.16, \"dy\": -2.22, \"dz\": -0.06, \"dpitch\": 0.92, \"dyaw\": -8.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.24, "window_alt_abs_m": 0.07, "target_px_mean_hist": 379.0, "cur_frame_id": 21, "source": "aug_001", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00030/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.77, 10.69, 20.06, -46.75, -62.95, 0.0]\n  Target bbox: [671.16, 329.17, 700.46, 395.37]\n\nFrame 2:\n  Drone pose: [-105.55, 10.23, 20.05, -46.95, -59.66, 0.0]\n  Target bbox: [625.34, 325.81, 654.75, 393.16]\n\nFrame 3:\n  Drone pose: [-105.34, 9.78, 20.04, -47.04, -60.35, 0.0]\n  Target bbox: [624.48, 325.29, 655.61, 393.68]\n\nFrame 4:\n  Drone pose: [-105.12, 9.32, 20.04, -49.35, -61.94, 0.0]\n  Target bbox: [634.42, 288.03, 666.93, 357.27]\n\nFrame 5 (current):\n  Drone pose: [-105.0, 8.95, 20.09, -46.19, -56.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 564.33, \"ymin\": 342.05, \"xmax\": 600.75, \"ymax\": 412.43}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.5, \"dz\": -0.06, \"dpitch\": -1.07, \"dyaw\": -5.8, \"droll\": 0.0}, {\"dx\": 0.47, \"dy\": -0.95, \"dz\": -0.07, \"dpitch\": -1.14, \"dyaw\": -6.41, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": -1.41, \"dz\": -0.07, \"dpitch\": -1.19, \"dyaw\": -7.0, \"droll\": 0.0}, {\"dx\": 0.81, \"dy\": -1.86, \"dz\": -0.07, \"dpitch\": -1.24, \"dyaw\": -7.54, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": -2.32, \"dz\": -0.08, \"dpitch\": -1.28, \"dyaw\": -8.06, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.92, "window_alt_abs_m": 0.08, "target_px_mean_hist": 501.2, "cur_frame_id": 30, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00035/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00036/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00039/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.97, 6.77, 20.12, -52.15, -68.65, 0.0]\n  Target bbox: [667.53, 248.05, 695.4, 315.5]\n\nFrame 2:\n  Drone pose: [-103.82, 6.09, 19.9, -48.37, -60.83, 0.0]\n  Target bbox: [572.39, 311.73, 606.61, 381.65]\n\nFrame 3:\n  Drone pose: [-103.87, 5.55, 19.99, -44.43, -67.86, 0.0]\n  Target bbox: [658.52, 381.46, 686.21, 447.87]\n\nFrame 4:\n  Drone pose: [-103.62, 5.23, 20.14, -52.8, -65.6, 0.0]\n  Target bbox: [621.77, 242.28, 647.77, 308.47]\n\nFrame 5 (current):\n  Drone pose: [-103.55, 4.84, 20.01, -47.49, -66.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.31, \"ymin\": 324.72, \"xmax\": 655.82, \"ymax\": 394.18}, \"waypoint_deltas\": [{\"dx\": 0.13, \"dy\": -0.51, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.22, \"dy\": -0.98, \"dz\": -0.01, \"dpitch\": -0.1, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": 0.3, \"dy\": -1.44, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -1.9, \"dz\": -0.01, \"dpitch\": -0.09, \"dyaw\": -1.32, \"droll\": 0.0}, {\"dx\": 0.48, \"dy\": -2.4, \"dz\": -0.01, \"dpitch\": -0.13, \"dyaw\": -1.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.91, "window_alt_abs_m": 0.58, "target_px_mean_hist": 529.0, "cur_frame_id": 39, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00048/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.02, 2.47, 20.02, -44.49, -66.64, 0.0]\n  Target bbox: [605.51, 377.82, 638.88, 447.35]\n\nFrame 2:\n  Drone pose: [-103.01, 1.93, 20.0, -51.98, -68.67, 0.0]\n  Target bbox: [631.25, 253.53, 660.08, 321.04]\n\nFrame 3:\n  Drone pose: [-102.95, 1.47, 20.0, -47.97, -73.28, 0.0]\n  Target bbox: [680.13, 322.62, 710.99, 389.07]\n\nFrame 4:\n  Drone pose: [-103.04, 1.09, 20.01, -47.42, -68.27, 0.0]\n  Target bbox: [624.02, 325.17, 656.11, 393.74]\n\nFrame 5 (current):\n  Drone pose: [-102.81, 0.34, 19.89, -45.88, -65.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 583.43, \"ymin\": 356.92, \"xmax\": 616.86, \"ymax\": 427.44}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": -0.32, \"dz\": 0.11, \"dpitch\": -1.78, \"dyaw\": -3.68, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -0.8, \"dz\": 0.11, \"dpitch\": -1.75, \"dyaw\": -3.78, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -1.28, \"dz\": 0.11, \"dpitch\": -1.73, \"dyaw\": -3.86, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.76, \"dz\": 0.11, \"dpitch\": -1.72, \"dyaw\": -3.91, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -2.25, \"dz\": 0.11, \"dpitch\": -1.71, \"dyaw\": -3.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.73, "window_alt_abs_m": 0.16, "target_px_mean_hist": 403.5, "cur_frame_id": 48, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.75, -2.1, 20.02, -47.9, -68.94, 0.0]\n  Target bbox: [627.46, 326.57, 652.66, 392.31]\n\nFrame 2:\n  Drone pose: [-102.76, -2.41, 20.0, -50.45, -66.51, 0.0]\n  Target bbox: [594.51, 277.86, 624.83, 345.72]\n\nFrame 3:\n  Drone pose: [-102.75, -2.9, 20.0, -44.48, -70.15, 0.0]\n  Target bbox: [634.72, 377.08, 667.75, 445.86]\n\nFrame 4:\n  Drone pose: [-102.69, -3.55, 20.04, -45.9, -66.25, 0.0]\n  Target bbox: [593.22, 360.11, 619.89, 427.25]\n\nFrame 5 (current):\n  Drone pose: [-102.77, -3.94, 19.97, -47.87, -69.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.81, \"ymin\": 319.86, \"xmax\": 657.1, \"ymax\": 388.99}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": -0.46, \"dz\": 0.03, \"dpitch\": 0.28, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -0.97, \"dz\": 0.03, \"dpitch\": 0.56, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": -1.48, \"dz\": 0.03, \"dpitch\": 0.55, \"dyaw\": 1.52, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": -1.99, \"dz\": 0.03, \"dpitch\": 0.54, \"dyaw\": 1.57, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": -2.5, \"dz\": 0.03, \"dpitch\": 0.53, \"dyaw\": 1.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.99, "window_alt_abs_m": 0.12, "target_px_mean_hist": 539.0, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.74, -6.44, 20.0, -47.6, -71.29, 0.0]\n  Target bbox: [664.73, 321.95, 699.3, 390.26]\n\nFrame 2:\n  Drone pose: [-102.74, -6.96, 20.0, -47.37, -67.59, 0.0]\n  Target bbox: [623.13, 324.83, 657.01, 394.11]\n\nFrame 3:\n  Drone pose: [-102.81, -7.56, 19.84, -47.26, -68.0, 0.0]\n  Target bbox: [632.77, 324.36, 664.4, 393.36]\n\nFrame 4:\n  Drone pose: [-102.83, -7.91, 19.98, -48.89, -69.67, 0.0]\n  Target bbox: [647.51, 295.34, 684.79, 367.9]\n\nFrame 5 (current):\n  Drone pose: [-102.78, -8.64, 20.09, -44.89, -63.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 599.71, \"ymin\": 367.99, \"xmax\": 632.32, \"ymax\": 436.77}, \"waypoint_deltas\": [{\"dx\": 0.1, \"dy\": -0.43, \"dz\": -0.09, \"dpitch\": -2.36, \"dyaw\": -2.47, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -0.97, \"dz\": -0.09, \"dpitch\": -2.09, \"dyaw\": -1.0, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": -1.51, \"dz\": -0.09, \"dpitch\": -2.15, \"dyaw\": -0.98, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": -2.05, \"dz\": -0.09, \"dpitch\": -1.85, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -2.59, \"dz\": -0.09, \"dpitch\": -1.89, \"dyaw\": 0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.81, "window_alt_abs_m": 0.41, "target_px_mean_hist": 543.8, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.7, -11.38, 19.88, -43.3, -57.7, 0.0]\n  Target bbox: [566.27, 386.06, 598.07, 454.35]\n\nFrame 2:\n  Drone pose: [-102.72, -11.78, 20.0, -44.0, -63.07, 0.0]\n  Target bbox: [626.63, 372.32, 660.0, 441.72]\n\nFrame 3:\n  Drone pose: [-102.61, -12.37, 19.97, -46.99, -62.96, 0.0]\n  Target bbox: [624.44, 324.78, 655.67, 394.16]\n\nFrame 4:\n  Drone pose: [-102.75, -12.87, 20.0, -46.94, -62.57, 0.0]\n  Target bbox: [626.84, 326.44, 653.27, 392.51]\n\nFrame 5 (current):\n  Drone pose: [-102.71, -13.41, 20.0, -50.48, -66.47, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 667.24, \"ymin\": 267.82, \"xmax\": 702.17, \"ymax\": 337.12}, \"waypoint_deltas\": [{\"dx\": 0.08, \"dy\": -0.54, \"dz\": 0.0, \"dpitch\": 3.35, \"dyaw\": 3.67, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": -1.07, \"dz\": 0.0, \"dpitch\": 3.59, \"dyaw\": 4.81, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.59, \"dz\": 0.0, \"dpitch\": 3.83, \"dyaw\": 5.84, \"droll\": 0.0}, {\"dx\": 0.5, \"dy\": -2.11, \"dz\": 0.0, \"dpitch\": 4.07, \"dyaw\": 6.83, \"droll\": 0.0}, {\"dx\": 0.65, \"dy\": -2.63, \"dz\": 0.0, \"dpitch\": 4.33, \"dyaw\": 7.83, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.78, "window_alt_abs_m": 0.19, "target_px_mean_hist": 534.2, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394/aug_001/frames_playback/frame_00084/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.06, -16.18, 20.12, -46.81, -62.14, 0.0]\n  Target bbox: [667.38, 321.19, 698.59, 389.9]\n\nFrame 2:\n  Drone pose: [-101.92, -16.56, 20.0, -44.88, -56.75, 0.0]\n  Target bbox: [613.99, 341.89, 644.67, 410.45]\n\nFrame 3:\n  Drone pose: [-101.79, -17.08, 20.0, -46.01, -57.97, 0.0]\n  Target bbox: [626.42, 327.36, 653.43, 391.62]\n\nFrame 4:\n  Drone pose: [-101.6, -17.44, 19.97, -45.52, -57.37, 0.0]\n  Target bbox: [623.46, 324.95, 656.6, 394.13]\n\nFrame 5 (current):\n  Drone pose: [-101.55, -18.11, 20.0, -45.83, -57.25, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.33, \"ymin\": 325.26, \"xmax\": 654.44, \"ymax\": 393.78}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": -0.51, \"dz\": 0.0, \"dpitch\": 0.31, \"dyaw\": 1.03, \"droll\": 0.0}, {\"dx\": 0.23, \"dy\": -1.02, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": -1.53, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.49, \"droll\": 0.0}, {\"dx\": 0.45, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": 0.42, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 0.57, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": 0.3, \"dyaw\": 1.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.34, "window_alt_abs_m": 0.17, "target_px_mean_hist": 524.8, "cur_frame_id": 84, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-15/trajectory_1776219394", "difficulty_score": 0.4581, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-114.61, 17.44, 22.0, -46.48, 0.0, 0.0]\n  Target bbox: [624.26, 330.69, 655.74, 388.44]\n\nFrame 2:\n  Drone pose: [-115.01, 16.42, 21.2, -44.07, 2.81, 0.0]\n  Target bbox: [624.05, 330.39, 656.19, 388.86]\n\nFrame 3:\n  Drone pose: [-114.88, 15.99, 20.67, -42.78, 3.92, 0.0]\n  Target bbox: [623.17, 329.57, 657.09, 389.71]\n\nFrame 4:\n  Drone pose: [-114.48, 15.86, 20.64, -42.59, 4.25, 0.0]\n  Target bbox: [623.84, 329.57, 656.41, 389.73]\n\nFrame 5 (current):\n  Drone pose: [-113.98, 15.84, 20.62, -42.55, 4.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.45, \"ymin\": 329.39, \"xmax\": 656.8, \"ymax\": 389.86}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.0, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 639.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.91, 15.84, 20.39, -42.32, 4.31, 0.0]\n  Target bbox: [622.49, 328.92, 657.78, 390.35]\n\nFrame 2:\n  Drone pose: [-110.39, 15.84, 20.36, -42.29, 4.31, 0.0]\n  Target bbox: [623.64, 326.8, 656.72, 392.62]\n\nFrame 3:\n  Drone pose: [-109.88, 15.84, 20.33, -42.26, 4.31, 0.0]\n  Target bbox: [622.37, 326.03, 658.05, 393.45]\n\nFrame 4:\n  Drone pose: [-109.37, 15.84, 20.3, -42.23, 4.31, 0.0]\n  Target bbox: [624.07, 328.63, 656.23, 390.77]\n\nFrame 5 (current):\n  Drone pose: [-108.86, 15.84, 20.27, -42.2, 4.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.34, \"ymin\": 329.11, \"xmax\": 656.93, \"ymax\": 390.2}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": 0.08, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 0.1, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.0, \"dz\": -0.12, \"dpitch\": 0.12, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.12, "target_px_mean_hist": 655.0, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.81, 15.84, 20.13, -42.07, 4.33, 0.0]\n  Target bbox: [623.2, 328.86, 657.06, 390.42]\n\nFrame 2:\n  Drone pose: [-105.3, 15.84, 20.12, -42.06, 4.33, 0.0]\n  Target bbox: [621.7, 326.54, 658.68, 392.83]\n\nFrame 3:\n  Drone pose: [-104.8, 15.84, 20.1, -42.04, 4.33, 0.0]\n  Target bbox: [623.48, 326.03, 656.91, 393.44]\n\nFrame 4:\n  Drone pose: [-104.29, 15.84, 20.09, -42.03, 4.34, 0.0]\n  Target bbox: [623.11, 326.67, 657.26, 392.77]\n\nFrame 5 (current):\n  Drone pose: [-103.78, 15.84, 20.08, -42.02, 4.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.92, \"ymin\": 328.08, \"xmax\": 656.38, \"ymax\": 391.31}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.53, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.06, "target_px_mean_hist": 669.8, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.24, 15.84, 20.03, -42.0, 4.35, 0.0]\n  Target bbox: [622.97, 328.49, 657.3, 390.75]\n\nFrame 2:\n  Drone pose: [-99.74, 15.84, 20.02, -42.0, 4.35, 0.0]\n  Target bbox: [623.59, 328.73, 656.67, 390.55]\n\nFrame 3:\n  Drone pose: [-99.23, 15.84, 20.02, -42.0, 4.35, 0.0]\n  Target bbox: [623.96, 328.25, 656.35, 391.15]\n\nFrame 4:\n  Drone pose: [-98.73, 15.84, 20.02, -42.0, 4.35, 0.0]\n  Target bbox: [623.16, 328.74, 657.11, 390.56]\n\nFrame 5 (current):\n  Drone pose: [-98.23, 15.84, 20.01, -42.0, 4.35, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.27, \"ymin\": 327.6, \"xmax\": 657.03, \"ymax\": 391.7}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.01, "target_px_mean_hist": 685.8, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.2, 15.84, 20.0, -42.02, 4.36, 0.0]\n  Target bbox: [623.31, 328.72, 656.96, 390.56]\n\nFrame 2:\n  Drone pose: [-94.7, 15.84, 20.0, -42.02, 4.36, 0.0]\n  Target bbox: [622.01, 328.18, 658.28, 391.07]\n\nFrame 3:\n  Drone pose: [-94.2, 15.84, 20.0, -42.02, 4.36, 0.0]\n  Target bbox: [623.42, 329.04, 656.84, 390.24]\n\nFrame 4:\n  Drone pose: [-93.69, 15.84, 20.0, -42.03, 4.36, 0.0]\n  Target bbox: [623.23, 327.34, 657.09, 391.98]\n\nFrame 5 (current):\n  Drone pose: [-93.19, 15.84, 20.0, -42.03, 4.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.55, \"ymin\": 327.23, \"xmax\": 656.78, \"ymax\": 392.13}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 682.8, "cur_frame_id": 45, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-89.67, 15.84, 20.0, -42.05, 4.37, 0.0]\n  Target bbox: [623.35, 329.15, 656.91, 390.14]\n\nFrame 2:\n  Drone pose: [-89.17, 15.84, 20.0, -42.05, 4.37, 0.0]\n  Target bbox: [623.27, 327.77, 657.03, 391.52]\n\nFrame 3:\n  Drone pose: [-88.67, 15.84, 20.0, -42.06, 4.37, 0.0]\n  Target bbox: [623.42, 328.77, 656.85, 390.52]\n\nFrame 4:\n  Drone pose: [-88.17, 15.84, 20.0, -42.06, 4.37, 0.0]\n  Target bbox: [623.19, 325.97, 657.22, 393.55]\n\nFrame 5 (current):\n  Drone pose: [-87.67, 15.84, 20.0, -42.06, 4.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.05, \"ymin\": 329.0, \"xmax\": 657.21, \"ymax\": 390.28}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 698.0, "cur_frame_id": 56, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.66, 15.83, 20.0, -42.07, 4.37, 0.0]\n  Target bbox: [623.19, 328.65, 657.07, 390.62]\n\nFrame 2:\n  Drone pose: [-84.16, 15.83, 20.0, -42.08, 4.38, 0.0]\n  Target bbox: [623.87, 328.08, 656.44, 391.3]\n\nFrame 3:\n  Drone pose: [-83.65, 15.83, 20.0, -42.08, 4.38, 0.0]\n  Target bbox: [621.52, 325.1, 658.92, 394.35]\n\nFrame 4:\n  Drone pose: [-83.15, 15.83, 20.0, -42.08, 4.39, 0.0]\n  Target bbox: [623.32, 329.11, 656.94, 390.17]\n\nFrame 5 (current):\n  Drone pose: [-82.65, 15.82, 20.0, -42.08, 4.41, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.3, \"ymin\": 325.37, \"xmax\": 657.11, \"ymax\": 394.11}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.26, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 683.0, "cur_frame_id": 66, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-79.64, 15.69, 20.0, -42.08, 4.78, 0.0]\n  Target bbox: [623.28, 325.46, 657.12, 394.0]\n\nFrame 2:\n  Drone pose: [-79.14, 15.63, 20.0, -42.08, 4.92, 0.0]\n  Target bbox: [621.93, 325.24, 658.51, 394.24]\n\nFrame 3:\n  Drone pose: [-78.63, 15.57, 20.0, -42.08, 5.09, 0.0]\n  Target bbox: [623.06, 328.52, 657.2, 390.75]\n\nFrame 4:\n  Drone pose: [-78.12, 15.5, 20.0, -42.08, 5.29, 0.0]\n  Target bbox: [623.48, 328.44, 656.8, 390.92]\n\nFrame 5 (current):\n  Drone pose: [-77.62, 15.41, 20.0, -42.08, 5.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.28, \"ymin\": 326.64, \"xmax\": 658.07, \"ymax\": 392.74}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.91, \"droll\": 0.0}, {\"dx\": 2.06, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": 2.57, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 1.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.75, "window_alt_abs_m": 0.0, "target_px_mean_hist": 690.2, "cur_frame_id": 76, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-74.0, 14.54, 20.0, -42.1, 7.91, 0.0]\n  Target bbox: [621.85, 327.02, 658.46, 392.35]\n\nFrame 2:\n  Drone pose: [-73.48, 14.43, 20.0, -42.11, 8.22, 0.0]\n  Target bbox: [620.56, 324.01, 659.89, 395.51]\n\nFrame 3:\n  Drone pose: [-72.95, 14.34, 20.0, -42.13, 8.47, 0.0]\n  Target bbox: [622.66, 327.63, 657.59, 391.64]\n\nFrame 4:\n  Drone pose: [-72.41, 14.28, 20.0, -42.16, 8.66, 0.0]\n  Target bbox: [617.3, 322.91, 662.6, 396.79]\n\nFrame 5 (current):\n  Drone pose: [-71.87, 14.24, 20.0, -42.1, 10.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.23, \"ymin\": 325.75, \"xmax\": 659.1, \"ymax\": 393.68}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.48, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 1.49, \"droll\": 0.0}, {\"dx\": 2.71, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 2.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 696.8, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-68.62, 14.28, 20.0, -42.18, 12.81, 0.0]\n  Target bbox: [616.55, 322.15, 663.28, 397.56]\n\nFrame 2:\n  Drone pose: [-68.07, 14.33, 20.0, -42.1, 14.03, 0.0]\n  Target bbox: [619.93, 324.07, 660.43, 395.46]\n\nFrame 3:\n  Drone pose: [-67.53, 14.39, 20.0, -42.18, 13.91, 0.0]\n  Target bbox: [617.46, 323.03, 662.36, 396.66]\n\nFrame 4:\n  Drone pose: [-66.98, 14.45, 20.0, -42.09, 15.08, 0.0]\n  Target bbox: [619.63, 325.08, 660.64, 394.35]\n\nFrame 5 (current):\n  Drone pose: [-66.44, 14.53, 20.0, -42.17, 14.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.7, \"ymin\": 324.52, \"xmax\": 660.16, \"ymax\": 395.04}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 2.14, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": 0.28, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 1.97, \"droll\": 0.0}, {\"dx\": 2.69, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.67, "window_alt_abs_m": 0.0, "target_px_mean_hist": 672.2, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-114.72, 17.51, 22.15, -46.51, -0.17, 0.0]\n  Target bbox: [624.17, 331.43, 655.77, 387.7]\n\nFrame 2:\n  Drone pose: [-115.01, 16.42, 21.2, -40.89, 7.05, 0.0]\n  Target bbox: [571.63, 383.82, 605.7, 444.61]\n\nFrame 3:\n  Drone pose: [-115.03, 16.08, 20.63, -46.83, 3.91, 0.0]\n  Target bbox: [619.18, 256.48, 654.75, 318.14]\n\nFrame 4:\n  Drone pose: [-114.58, 16.0, 20.64, -42.47, 3.85, 0.0]\n  Target bbox: [624.2, 329.71, 656.05, 389.59]\n\nFrame 5 (current):\n  Drone pose: [-113.93, 15.97, 20.75, -46.06, 1.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 660.1, \"ymin\": 272.81, \"xmax\": 693.1, \"ymax\": 338.83}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.13, \"dz\": -0.16, \"dpitch\": 3.53, \"dyaw\": 3.26, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": -0.13, \"dz\": -0.18, \"dpitch\": 3.55, \"dyaw\": 3.26, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.13, \"dz\": -0.2, \"dpitch\": 3.56, \"dyaw\": 3.27, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.13, \"dz\": -0.22, \"dpitch\": 3.57, \"dyaw\": 3.27, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": -0.13, \"dz\": -0.33, \"dpitch\": 3.71, \"dyaw\": 3.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.25, "window_alt_abs_m": 1.63, "target_px_mean_hist": 639.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.91, 15.84, 20.39, -46.12, 4.92, 0.0]\n  Target bbox: [614.25, 264.56, 650.37, 327.07]\n\nFrame 2:\n  Drone pose: [-110.49, 15.93, 20.45, -42.31, 4.05, 0.0]\n  Target bbox: [622.2, 325.55, 658.23, 393.93]\n\nFrame 3:\n  Drone pose: [-109.82, 15.97, 20.36, -39.91, 0.87, 0.0]\n  Target bbox: [661.99, 372.42, 695.62, 431.6]\n\nFrame 4:\n  Drone pose: [-109.48, 15.69, 20.36, -42.49, 5.61, 0.0]\n  Target bbox: [611.55, 320.23, 645.4, 388.56]\n\nFrame 5 (current):\n  Drone pose: [-108.86, 15.84, 20.27, -43.43, 2.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 643.29, \"ymin\": 305.72, \"xmax\": 678.78, \"ymax\": 372.99}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 1.26, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 1.28, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": 1.31, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 1.33, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.0, \"dz\": -0.12, \"dpitch\": 1.35, \"dyaw\": 1.66, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.72, "window_alt_abs_m": 0.26, "target_px_mean_hist": 665.0, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.73, 15.86, 20.18, -42.97, 9.28, 0.0]\n  Target bbox: [559.03, 314.98, 596.43, 384.05]\n\nFrame 2:\n  Drone pose: [-105.31, 15.77, 19.97, -41.81, 4.5, 0.0]\n  Target bbox: [621.81, 327.78, 658.5, 391.51]\n\nFrame 3:\n  Drone pose: [-104.78, 15.83, 20.16, -40.82, 3.67, 0.0]\n  Target bbox: [632.36, 351.39, 665.26, 412.56]\n\nFrame 4:\n  Drone pose: [-104.17, 15.73, 20.12, -42.22, 4.66, 0.0]\n  Target bbox: [623.22, 328.9, 657.02, 390.34]\n\nFrame 5 (current):\n  Drone pose: [-103.74, 15.7, 20.1, -43.14, 6.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 601.96, \"ymin\": 311.15, \"xmax\": 634.86, \"ymax\": 373.67}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": 0.14, \"dz\": -0.03, \"dpitch\": 1.12, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 0.97, \"dy\": 0.14, \"dz\": -0.04, \"dpitch\": 1.13, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": 0.14, \"dz\": -0.05, \"dpitch\": 1.13, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 1.98, \"dy\": 0.14, \"dz\": -0.06, \"dpitch\": 1.14, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": 0.14, \"dz\": -0.06, \"dpitch\": 1.14, \"dyaw\": -2.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.36, "window_alt_abs_m": 0.46, "target_px_mean_hist": 682.0, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-100.19, 15.8, 19.94, -41.95, 4.46, 0.0]\n  Target bbox: [622.08, 325.52, 658.34, 393.96]\n\nFrame 2:\n  Drone pose: [-99.83, 15.73, 20.01, -41.85, 4.63, 0.0]\n  Target bbox: [623.59, 328.73, 656.67, 390.56]\n\nFrame 3:\n  Drone pose: [-99.28, 15.93, 20.15, -41.64, -0.9, 0.0]\n  Target bbox: [685.49, 337.01, 720.0, 403.15]\n\nFrame 4:\n  Drone pose: [-98.73, 15.84, 20.02, -42.5, 5.23, 0.0]\n  Target bbox: [610.27, 317.39, 647.71, 385.37]\n\nFrame 5 (current):\n  Drone pose: [-98.14, 15.95, 20.14, -42.32, 4.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.47, \"ymin\": 329.3, \"xmax\": 656.78, \"ymax\": 389.97}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": -0.11, \"dz\": -0.13, \"dpitch\": 0.32, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -0.11, \"dz\": -0.13, \"dpitch\": 0.31, \"dyaw\": 0.29, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": -0.11, \"dz\": -0.13, \"dpitch\": 0.31, \"dyaw\": 0.3, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": -0.11, \"dz\": -0.13, \"dpitch\": 0.31, \"dyaw\": 0.3, \"droll\": 0.0}, {\"dx\": 2.43, \"dy\": -0.11, \"dz\": -0.14, \"dpitch\": 0.3, \"dyaw\": 0.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.02, "window_alt_abs_m": 0.47, "target_px_mean_hist": 682.2, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00045/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-95.2, 15.84, 20.0, -42.45, 7.49, 0.0]\n  Target bbox: [583.28, 321.74, 618.2, 384.55]\n\nFrame 2:\n  Drone pose: [-94.73, 15.84, 19.91, -43.76, 2.86, 0.0]\n  Target bbox: [641.46, 296.85, 676.44, 358.57]\n\nFrame 3:\n  Drone pose: [-94.2, 15.84, 20.0, -39.77, 5.24, 0.0]\n  Target bbox: [612.29, 364.56, 645.78, 430.61]\n\nFrame 4:\n  Drone pose: [-93.58, 15.71, 20.04, -42.23, 4.73, 0.0]\n  Target bbox: [622.33, 326.11, 658.06, 393.33]\n\nFrame 5 (current):\n  Drone pose: [-93.19, 15.84, 20.0, -41.53, 4.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.14, \"ymin\": 336.69, \"xmax\": 657.58, \"ymax\": 399.68}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.5, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.51, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.51, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.51, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.52, \"dyaw\": 0.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.97, "window_alt_abs_m": 0.26, "target_px_mean_hist": 683.8, "cur_frame_id": 45, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00056/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-89.67, 15.84, 20.0, -46.1, 1.93, 0.0]\n  Target bbox: [653.19, 259.5, 688.69, 324.5]\n\nFrame 2:\n  Drone pose: [-89.21, 15.96, 19.89, -42.62, 2.52, 0.0]\n  Target bbox: [642.66, 316.0, 675.94, 377.33]\n\nFrame 3:\n  Drone pose: [-88.7, 15.88, 19.88, -41.84, 4.23, 0.0]\n  Target bbox: [623.65, 328.61, 656.64, 390.74]\n\nFrame 4:\n  Drone pose: [-88.17, 15.84, 20.0, -44.05, -0.63, 0.0]\n  Target bbox: [684.69, 295.86, 721.07, 360.39]\n\nFrame 5 (current):\n  Drone pose: [-87.67, 15.84, 20.0, -42.06, 4.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.3, \"ymin\": 325.36, \"xmax\": 657.11, \"ymax\": 394.13}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.16, "window_alt_abs_m": 0.24, "target_px_mean_hist": 687.5, "cur_frame_id": 56, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00066/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-84.61, 15.95, 20.09, -41.66, 6.44, 0.0]\n  Target bbox: [593.17, 339.36, 627.75, 401.9]\n\nFrame 2:\n  Drone pose: [-84.04, 15.72, 20.01, -42.23, 4.71, 0.0]\n  Target bbox: [623.17, 328.99, 657.09, 390.3]\n\nFrame 3:\n  Drone pose: [-83.62, 15.8, 20.17, -42.37, 4.47, 0.0]\n  Target bbox: [622.17, 325.97, 658.23, 393.45]\n\nFrame 4:\n  Drone pose: [-83.15, 15.83, 20.0, -42.57, 6.07, 0.0]\n  Target bbox: [601.63, 317.01, 636.15, 386.44]\n\nFrame 5 (current):\n  Drone pose: [-82.65, 15.82, 20.0, -42.92, 4.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.57, \"ymin\": 311.96, \"xmax\": 656.3, \"ymax\": 379.39}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": -0.11, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": 0.04, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 0.84, \"dyaw\": 0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.12, "window_alt_abs_m": 0.41, "target_px_mean_hist": 695.5, "cur_frame_id": 66, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00076/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-79.47, 15.68, 20.01, -44.59, 8.05, 0.0]\n  Target bbox: [580.04, 286.77, 619.7, 358.29]\n\nFrame 2:\n  Drone pose: [-79.22, 15.62, 20.02, -42.0, 4.94, 0.0]\n  Target bbox: [623.32, 326.43, 657.04, 392.98]\n\nFrame 3:\n  Drone pose: [-78.67, 15.48, 20.03, -42.06, 5.33, 0.0]\n  Target bbox: [621.92, 325.31, 658.5, 394.16]\n\nFrame 4:\n  Drone pose: [-78.0, 15.49, 20.03, -44.64, 3.68, 0.0]\n  Target bbox: [644.21, 289.51, 678.43, 351.38]\n\nFrame 5 (current):\n  Drone pose: [-77.62, 15.41, 20.0, -45.6, 8.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 587.67, \"ymin\": 269.7, \"xmax\": 623.46, \"ymax\": 332.52}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 3.52, \"dyaw\": -2.47, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -0.21, \"dz\": 0.0, \"dpitch\": 3.52, \"dyaw\": -2.17, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.33, \"dz\": 0.0, \"dpitch\": 3.52, \"dyaw\": -1.83, \"droll\": 0.0}, {\"dx\": 2.06, \"dy\": -0.46, \"dz\": 0.0, \"dpitch\": 3.52, \"dyaw\": -1.47, \"droll\": 0.0}, {\"dx\": 2.57, \"dy\": -0.6, \"dz\": 0.0, \"dpitch\": 3.51, \"dyaw\": -1.09, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.75, "window_alt_abs_m": 0.05, "target_px_mean_hist": 680.2, "cur_frame_id": 76, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-73.98, 14.6, 20.1, -42.28, 7.76, 0.0]\n  Target bbox: [622.16, 325.89, 658.21, 393.57]\n\nFrame 2:\n  Drone pose: [-73.48, 14.43, 20.0, -41.2, 10.47, 0.0]\n  Target bbox: [594.08, 341.53, 629.72, 409.15]\n\nFrame 3:\n  Drone pose: [-72.95, 14.34, 20.0, -42.13, 8.47, 0.0]\n  Target bbox: [622.58, 328.37, 657.66, 390.94]\n\nFrame 4:\n  Drone pose: [-72.52, 14.27, 19.93, -41.92, 8.64, 0.0]\n  Target bbox: [618.88, 322.57, 660.96, 397.03]\n\nFrame 5 (current):\n  Drone pose: [-71.87, 14.24, 20.0, -42.1, 10.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.62, \"ymin\": 327.05, \"xmax\": 658.65, \"ymax\": 392.31}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 1.08, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.48, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.5, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 1.49, \"droll\": 0.0}, {\"dx\": 2.71, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 2.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.35, "window_alt_abs_m": 0.23, "target_px_mean_hist": 679.0, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-68.62, 14.28, 20.0, -42.18, 12.81, 0.0]\n  Target bbox: [620.5, 324.19, 659.36, 395.33]\n\nFrame 2:\n  Drone pose: [-68.06, 14.5, 20.08, -40.86, 13.86, 0.0]\n  Target bbox: [618.6, 350.42, 654.73, 416.99]\n\nFrame 3:\n  Drone pose: [-67.53, 14.39, 20.0, -40.29, 9.71, 0.0]\n  Target bbox: [670.24, 356.11, 715.05, 429.39]\n\nFrame 4:\n  Drone pose: [-66.98, 14.45, 20.0, -42.76, 12.42, 0.0]\n  Target bbox: [654.07, 316.28, 693.26, 381.49]\n\nFrame 5 (current):\n  Drone pose: [-66.44, 14.53, 20.0, -42.17, 14.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.21, \"ymin\": 324.68, \"xmax\": 659.64, \"ymax\": 394.88}, \"waypoint_deltas\": [{\"dx\": 0.55, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": 1.62, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": 2.14, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": 0.28, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": 1.97, \"droll\": 0.0}, {\"dx\": 2.69, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.79, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.41, "window_alt_abs_m": 0.16, "target_px_mean_hist": 702.0, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip37/2026-04-16/trajectory_1776307480", "difficulty_score": 0.2117, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.86, 69.67, 22.0, -46.47, 98.53, 0.0]\n  Target bbox: [619.03, 324.46, 660.85, 395.02]\n\nFrame 2:\n  Drone pose: [105.52, 69.27, 21.2, -44.26, 95.88, 0.0]\n  Target bbox: [625.56, 331.03, 654.63, 388.25]\n\nFrame 3:\n  Drone pose: [104.78, 69.4, 20.67, -43.08, 93.77, 0.0]\n  Target bbox: [624.73, 327.99, 655.27, 391.39]\n\nFrame 4:\n  Drone pose: [104.36, 69.79, 20.64, -42.89, 93.97, 0.0]\n  Target bbox: [624.24, 327.73, 655.68, 391.71]\n\nFrame 5 (current):\n  Drone pose: [104.1, 70.28, 20.62, -42.81, 94.62, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.27, \"ymin\": 330.05, \"xmax\": 656.97, \"ymax\": 389.23}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.0, \"dyaw\": -0.47, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 0.01, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": 1.53, \"dz\": -0.07, \"dpitch\": 0.02, \"dyaw\": -1.11, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": 2.04, \"dz\": -0.09, \"dpitch\": 0.03, \"dyaw\": -1.36, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": 2.54, \"dz\": -0.2, \"dpitch\": 0.17, \"dyaw\": -1.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.61, "window_alt_abs_m": 1.38, "target_px_mean_hist": 450.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.51, 72.82, 20.42, -42.64, 93.03, 0.0]\n  Target bbox: [628.25, 331.09, 651.93, 388.21]\n\nFrame 2:\n  Drone pose: [103.43, 73.33, 20.39, -42.6, 92.82, 0.0]\n  Target bbox: [619.92, 329.25, 660.39, 389.98]\n\nFrame 3:\n  Drone pose: [103.36, 73.83, 20.36, -42.57, 92.63, 0.0]\n  Target bbox: [624.16, 329.7, 656.09, 389.57]\n\nFrame 4:\n  Drone pose: [103.3, 74.34, 20.33, -42.54, 92.46, 0.0]\n  Target bbox: [621.8, 329.78, 658.47, 389.44]\n\nFrame 5 (current):\n  Drone pose: [103.25, 74.84, 20.3, -42.51, 92.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.44, \"ymin\": 330.49, \"xmax\": 659.84, \"ymax\": 388.78}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": 1.02, \"dz\": -0.06, \"dpitch\": 0.06, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 1.53, \"dz\": -0.08, \"dpitch\": 0.09, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 2.04, \"dz\": -0.11, \"dpitch\": 0.11, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 2.56, \"dz\": -0.13, \"dpitch\": 0.12, \"dyaw\": -0.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.72, "window_alt_abs_m": 0.12, "target_px_mean_hist": 493.5, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.23, 77.4, 20.17, -42.39, 92.28, 0.0]\n  Target bbox: [625.41, 330.36, 654.81, 388.91]\n\nFrame 2:\n  Drone pose: [103.3, 77.92, 20.15, -42.39, 92.46, 0.0]\n  Target bbox: [619.74, 329.23, 660.57, 389.99]\n\nFrame 3:\n  Drone pose: [103.39, 78.45, 20.13, -42.4, 92.72, 0.0]\n  Target bbox: [621.57, 329.5, 658.7, 389.71]\n\nFrame 4:\n  Drone pose: [103.51, 78.99, 20.12, -42.41, 93.06, 0.0]\n  Target bbox: [620.36, 330.0, 659.93, 389.27]\n\nFrame 5 (current):\n  Drone pose: [103.66, 79.53, 20.1, -42.44, 93.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.1, \"ymin\": 329.97, \"xmax\": 654.11, \"ymax\": 389.28}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.55, \"dz\": -0.01, \"dpitch\": -0.04, \"dyaw\": 0.48, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 1.11, \"dz\": -0.02, \"dpitch\": -0.08, \"dyaw\": 1.01, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": 1.68, \"dz\": -0.03, \"dpitch\": -0.14, \"dyaw\": 1.59, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": 2.26, \"dz\": -0.04, \"dpitch\": -0.21, \"dyaw\": 2.17, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 2.85, \"dz\": -0.05, \"dpitch\": -0.29, \"dyaw\": 2.73, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.18, "window_alt_abs_m": 0.07, "target_px_mean_hist": 487.0, "cur_frame_id": 22, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.62, 82.38, 20.05, -42.73, 96.19, 0.0]\n  Target bbox: [621.01, 328.97, 659.23, 390.27]\n\nFrame 2:\n  Drone pose: [104.79, 82.98, 20.04, -42.83, 96.68, 0.0]\n  Target bbox: [627.53, 329.83, 652.64, 389.44]\n\nFrame 3:\n  Drone pose: [104.9, 83.59, 20.04, -42.95, 97.03, 0.0]\n  Target bbox: [625.0, 328.82, 655.21, 390.43]\n\nFrame 4:\n  Drone pose: [104.95, 84.2, 20.03, -43.09, 97.21, 0.0]\n  Target bbox: [623.07, 325.91, 656.82, 393.51]\n\nFrame 5 (current):\n  Drone pose: [104.94, 84.83, 20.03, -43.16, 98.61, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.37, \"ymin\": 326.91, \"xmax\": 655.5, \"ymax\": 392.5}, \"waypoint_deltas\": [{\"dx\": -0.07, \"dy\": 0.63, \"dz\": -0.01, \"dpitch\": -0.08, \"dyaw\": 1.25, \"droll\": 0.0}, {\"dx\": -0.19, \"dy\": 1.26, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": 2.37, \"droll\": 0.0}, {\"dx\": -0.35, \"dy\": 1.89, \"dz\": -0.01, \"dpitch\": -0.25, \"dyaw\": 3.4, \"droll\": 0.0}, {\"dx\": -0.54, \"dy\": 2.53, \"dz\": -0.02, \"dpitch\": -0.34, \"dyaw\": 4.35, \"droll\": 0.0}, {\"dx\": -0.75, \"dy\": 3.17, \"dz\": -0.02, \"dpitch\": -0.43, \"dyaw\": 5.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.42, "window_alt_abs_m": 0.02, "target_px_mean_hist": 501.2, "cur_frame_id": 31, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.96, 88.64, 20.01, -43.68, 104.71, 0.0]\n  Target bbox: [621.29, 324.64, 658.55, 394.8]\n\nFrame 2:\n  Drone pose: [103.72, 89.27, 20.01, -43.77, 105.54, 0.0]\n  Target bbox: [624.42, 327.57, 655.53, 391.68]\n\nFrame 3:\n  Drone pose: [103.47, 89.9, 20.01, -43.85, 106.34, 0.0]\n  Target bbox: [622.16, 325.05, 657.66, 394.36]\n\nFrame 4:\n  Drone pose: [103.21, 90.52, 20.0, -43.92, 107.11, 0.0]\n  Target bbox: [624.63, 327.75, 655.3, 391.49]\n\nFrame 5 (current):\n  Drone pose: [102.94, 91.15, 20.0, -43.98, 107.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.47, \"ymin\": 326.15, \"xmax\": 655.36, \"ymax\": 393.2}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": 0.61, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": 1.22, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 1.38, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.03, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 2.42, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": 2.64, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 3.02, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 3.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.14, "window_alt_abs_m": 0.01, "target_px_mean_hist": 507.0, "cur_frame_id": 41, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.44, 94.17, 20.0, -44.21, 111.09, 0.0]\n  Target bbox: [626.55, 327.1, 653.29, 392.18]\n\nFrame 2:\n  Drone pose: [101.11, 94.76, 20.0, -44.24, 111.66, 0.0]\n  Target bbox: [626.73, 327.26, 653.11, 392.01]\n\nFrame 3:\n  Drone pose: [100.79, 95.34, 20.0, -44.27, 112.22, 0.0]\n  Target bbox: [619.77, 323.24, 659.96, 396.21]\n\nFrame 4:\n  Drone pose: [100.46, 95.93, 20.0, -44.28, 112.78, 0.0]\n  Target bbox: [620.63, 324.33, 659.17, 394.99]\n\nFrame 5 (current):\n  Drone pose: [100.14, 96.51, 20.0, -44.28, 113.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.55, \"ymin\": 323.21, \"xmax\": 660.16, \"ymax\": 396.25}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.27, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": 1.72, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 1.94, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 2.28, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": -1.41, \"dy\": 2.85, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 2.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.27, "window_alt_abs_m": 0.0, "target_px_mean_hist": 522.5, "cur_frame_id": 50, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.73, 99.36, 20.0, -44.4, 115.38, 0.0]\n  Target bbox: [619.94, 323.17, 659.79, 396.15]\n\nFrame 2:\n  Drone pose: [98.47, 99.93, 20.0, -44.34, 116.09, 0.0]\n  Target bbox: [622.49, 324.59, 657.28, 394.68]\n\nFrame 3:\n  Drone pose: [98.2, 100.5, 20.0, -44.28, 116.78, 0.0]\n  Target bbox: [620.29, 325.04, 659.8, 394.26]\n\nFrame 4:\n  Drone pose: [97.91, 101.06, 20.0, -44.54, 116.11, 0.0]\n  Target bbox: [621.31, 324.97, 658.47, 394.32]\n\nFrame 5 (current):\n  Drone pose: [97.61, 101.62, 20.0, -44.5, 116.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.22, \"ymin\": 326.03, \"xmax\": 654.56, \"ymax\": 393.29}, \"waypoint_deltas\": [{\"dx\": -0.33, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": -0.68, \"dy\": 1.13, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -1.05, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.46, \"droll\": 0.0}, {\"dx\": -1.43, \"dy\": 2.26, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.86, \"droll\": 0.0}, {\"dx\": -1.82, \"dy\": 2.83, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.66, "window_alt_abs_m": 0.0, "target_px_mean_hist": 518.8, "cur_frame_id": 59, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.41, 105.01, 20.0, -44.45, 119.33, 0.0]\n  Target bbox: [624.35, 325.28, 655.4, 393.98]\n\nFrame 2:\n  Drone pose: [95.02, 105.57, 20.0, -44.45, 119.71, 0.0]\n  Target bbox: [620.38, 323.11, 659.27, 396.27]\n\nFrame 3:\n  Drone pose: [94.59, 106.13, 20.0, -44.47, 119.98, 0.0]\n  Target bbox: [622.99, 324.9, 656.76, 394.34]\n\nFrame 4:\n  Drone pose: [94.18, 106.68, 20.0, -44.48, 120.27, 0.0]\n  Target bbox: [625.89, 326.46, 653.89, 392.82]\n\nFrame 5 (current):\n  Drone pose: [93.76, 107.24, 20.0, -44.48, 120.58, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.56, \"ymin\": 325.97, \"xmax\": 657.23, \"ymax\": 393.21}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": 0.56, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": -0.83, \"dy\": 1.11, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.6, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 1.66, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.88, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": 2.21, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": 2.76, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.25, "window_alt_abs_m": 0.0, "target_px_mean_hist": 522.5, "cur_frame_id": 69, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.66, 110.0, 20.0, -44.51, 121.97, 0.0]\n  Target bbox: [622.02, 325.55, 657.71, 393.7]\n\nFrame 2:\n  Drone pose: [91.23, 110.54, 20.0, -44.51, 122.21, 0.0]\n  Target bbox: [621.12, 323.94, 658.53, 395.44]\n\nFrame 3:\n  Drone pose: [90.79, 111.08, 20.0, -44.51, 122.44, 0.0]\n  Target bbox: [625.0, 326.95, 654.79, 392.24]\n\nFrame 4:\n  Drone pose: [90.35, 111.62, 20.0, -44.51, 122.65, 0.0]\n  Target bbox: [622.63, 324.91, 657.05, 394.44]\n\nFrame 5 (current):\n  Drone pose: [89.91, 112.16, 20.0, -44.51, 122.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.18, \"ymin\": 328.22, \"xmax\": 655.66, \"ymax\": 390.93}, \"waypoint_deltas\": [{\"dx\": -0.45, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": -0.9, \"dy\": 1.06, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.34, \"droll\": 0.0}, {\"dx\": -1.36, \"dy\": 1.58, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.47, \"droll\": 0.0}, {\"dx\": -1.83, \"dy\": 2.1, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.58, \"droll\": 0.0}, {\"dx\": -2.3, \"dy\": 2.61, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.88, "window_alt_abs_m": 0.0, "target_px_mean_hist": 525.0, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/ORI/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [87.61, 114.77, 20.0, -44.49, 123.52, 0.0]\n  Target bbox: [625.55, 329.19, 654.31, 389.95]\n\nFrame 2:\n  Drone pose: [87.13, 115.28, 20.0, -44.48, 123.59, 0.0]\n  Target bbox: [623.32, 325.24, 656.37, 394.09]\n\nFrame 3:\n  Drone pose: [86.64, 115.78, 20.0, -44.47, 123.63, 0.0]\n  Target bbox: [623.76, 325.21, 655.97, 394.01]\n\nFrame 4:\n  Drone pose: [86.15, 116.28, 20.0, -44.46, 123.65, 0.0]\n  Target bbox: [622.47, 326.06, 657.27, 393.18]\n\nFrame 5 (current):\n  Drone pose: [85.65, 116.77, 20.0, -44.45, 123.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.63, \"ymin\": 328.29, \"xmax\": 654.2, \"ymax\": 390.86}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": 0.49, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 0.97, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -1.53, \"dy\": 1.45, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": 1.92, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": 2.4, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.13, "window_alt_abs_m": 0.0, "target_px_mean_hist": 537.0, "cur_frame_id": 87, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.86, 69.67, 22.0, -44.36, 93.53, 0.0]\n  Target bbox: [681.11, 365.11, 715.08, 428.79]\n\nFrame 2:\n  Drone pose: [105.68, 69.2, 21.15, -44.07, 96.3, 0.0]\n  Target bbox: [621.38, 329.77, 658.85, 389.46]\n\nFrame 3:\n  Drone pose: [104.82, 69.43, 20.77, -46.39, 96.3, 0.0]\n  Target bbox: [593.89, 275.76, 626.32, 339.48]\n\nFrame 4:\n  Drone pose: [104.4, 69.79, 20.55, -42.75, 94.08, 0.0]\n  Target bbox: [618.71, 323.63, 661.21, 395.99]\n\nFrame 5 (current):\n  Drone pose: [104.1, 70.28, 20.62, -44.15, 90.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 669.89, \"ymin\": 309.78, \"xmax\": 709.22, \"ymax\": 366.89}, \"waypoint_deltas\": [{\"dx\": -0.17, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 1.34, \"dyaw\": 3.51, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 1.35, \"dyaw\": 3.16, \"droll\": 0.0}, {\"dx\": -0.41, \"dy\": 1.53, \"dz\": -0.07, \"dpitch\": 1.36, \"dyaw\": 2.87, \"droll\": 0.0}, {\"dx\": -0.5, \"dy\": 2.04, \"dz\": -0.09, \"dpitch\": 1.37, \"dyaw\": 2.62, \"droll\": 0.0}, {\"dx\": -0.59, \"dy\": 2.54, \"dz\": -0.2, \"dpitch\": 1.51, \"dyaw\": 2.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.43, "window_alt_abs_m": 1.52, "target_px_mean_hist": 454.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.51, 72.82, 20.42, -43.25, 88.03, 0.0]\n  Target bbox: [691.05, 322.2, 712.95, 380.2]\n\nFrame 2:\n  Drone pose: [103.32, 73.22, 20.28, -42.31, 92.49, 0.0]\n  Target bbox: [623.92, 330.18, 656.33, 389.09]\n\nFrame 3:\n  Drone pose: [103.25, 73.72, 20.29, -42.34, 92.31, 0.0]\n  Target bbox: [626.82, 330.9, 653.38, 388.39]\n\nFrame 4:\n  Drone pose: [103.33, 74.35, 20.29, -46.91, 91.27, 0.0]\n  Target bbox: [637.44, 255.41, 674.95, 315.91]\n\nFrame 5 (current):\n  Drone pose: [103.25, 74.84, 20.3, -44.82, 91.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 638.64, \"ymin\": 291.66, \"xmax\": 671.89, \"ymax\": 350.14}, \"waypoint_deltas\": [{\"dx\": -0.05, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 2.35, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": -0.07, \"dy\": 1.02, \"dz\": -0.06, \"dpitch\": 2.37, \"dyaw\": 1.02, \"droll\": 0.0}, {\"dx\": -0.08, \"dy\": 1.53, \"dz\": -0.08, \"dpitch\": 2.4, \"dyaw\": 1.0, \"droll\": 0.0}, {\"dx\": -0.06, \"dy\": 2.04, \"dz\": -0.11, \"dpitch\": 2.42, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -0.02, \"dy\": 2.56, \"dz\": -0.13, \"dpitch\": 2.43, \"dyaw\": 1.16, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.84, "window_alt_abs_m": 0.17, "target_px_mean_hist": 492.8, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00018/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00022/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.2, 77.47, 20.09, -42.38, 92.21, 0.0]\n  Target bbox: [626.74, 330.59, 653.46, 388.68]\n\nFrame 2:\n  Drone pose: [103.3, 77.92, 20.15, -41.35, 96.96, 0.0]\n  Target bbox: [564.16, 348.4, 603.8, 408.82]\n\nFrame 3:\n  Drone pose: [103.45, 78.51, 19.98, -40.06, 89.84, 0.0]\n  Target bbox: [666.12, 368.2, 690.21, 426.04]\n\nFrame 4:\n  Drone pose: [103.47, 79.11, 20.11, -41.75, 97.96, 0.0]\n  Target bbox: [556.83, 344.57, 599.02, 405.8]\n\nFrame 5 (current):\n  Drone pose: [103.66, 79.53, 20.1, -43.81, 98.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.92, \"ymin\": 307.68, \"xmax\": 603.16, \"ymax\": 368.94}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": 0.55, \"dz\": -0.01, \"dpitch\": 1.33, \"dyaw\": -4.18, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 1.11, \"dz\": -0.02, \"dpitch\": 1.29, \"dyaw\": -3.65, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": 1.68, \"dz\": -0.03, \"dpitch\": 1.23, \"dyaw\": -3.07, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": 2.26, \"dz\": -0.04, \"dpitch\": 1.16, \"dyaw\": -2.49, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": 2.85, \"dz\": -0.05, \"dpitch\": 1.08, \"dyaw\": -1.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.17, "window_alt_abs_m": 0.36, "target_px_mean_hist": 503.2, "cur_frame_id": 22, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00031/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.61, 82.34, 20.19, -42.89, 96.13, 0.0]\n  Target bbox: [619.43, 329.3, 660.84, 389.98]\n\nFrame 2:\n  Drone pose: [104.87, 82.85, 20.06, -39.45, 91.87, 0.0]\n  Target bbox: [685.54, 385.91, 718.88, 445.33]\n\nFrame 3:\n  Drone pose: [104.9, 83.59, 20.04, -41.03, 98.26, 0.0]\n  Target bbox: [607.17, 361.04, 642.27, 422.92]\n\nFrame 4:\n  Drone pose: [104.95, 84.2, 20.03, -47.14, 102.21, 0.0]\n  Target bbox: [562.82, 261.26, 593.61, 325.63]\n\nFrame 5 (current):\n  Drone pose: [104.89, 84.82, 20.01, -43.15, 98.48, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.0, \"ymin\": 322.25, \"xmax\": 661.87, \"ymax\": 397.32}, \"waypoint_deltas\": [{\"dx\": -0.02, \"dy\": 0.64, \"dz\": 0.01, \"dpitch\": -0.09, \"dyaw\": 1.38, \"droll\": 0.0}, {\"dx\": -0.14, \"dy\": 1.27, \"dz\": 0.01, \"dpitch\": -0.17, \"dyaw\": 2.5, \"droll\": 0.0}, {\"dx\": -0.3, \"dy\": 1.9, \"dz\": 0.01, \"dpitch\": -0.26, \"dyaw\": 3.53, \"droll\": 0.0}, {\"dx\": -0.49, \"dy\": 2.54, \"dz\": 0.0, \"dpitch\": -0.35, \"dyaw\": 4.48, \"droll\": 0.0}, {\"dx\": -0.7, \"dy\": 3.18, \"dz\": 0.0, \"dpitch\": -0.44, \"dyaw\": 5.37, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.32, "window_alt_abs_m": 0.18, "target_px_mean_hist": 511.2, "cur_frame_id": 31, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00037/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00041/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.0, 88.69, 20.19, -45.32, 106.76, 0.0]\n  Target bbox: [601.5, 306.26, 632.16, 369.84]\n\nFrame 2:\n  Drone pose: [103.72, 89.27, 20.01, -39.67, 103.28, 0.0]\n  Target bbox: [653.31, 396.45, 681.78, 461.43]\n\nFrame 3:\n  Drone pose: [103.35, 89.97, 19.95, -42.24, 106.34, 0.0]\n  Target bbox: [615.82, 350.93, 656.45, 424.43]\n\nFrame 4:\n  Drone pose: [103.21, 90.52, 20.0, -43.35, 105.5, 0.0]\n  Target bbox: [642.95, 335.6, 676.48, 403.28]\n\nFrame 5 (current):\n  Drone pose: [102.94, 91.15, 20.0, -43.98, 107.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.61, \"ymin\": 326.84, \"xmax\": 654.22, \"ymax\": 392.49}, \"waypoint_deltas\": [{\"dx\": -0.28, \"dy\": 0.61, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.71, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": 1.22, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 1.38, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.03, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": 2.42, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": 2.64, \"droll\": 0.0}, {\"dx\": -1.5, \"dy\": 3.02, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 3.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.72, "window_alt_abs_m": 0.31, "target_px_mean_hist": 513.0, "cur_frame_id": 41, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00046/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00050/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [101.52, 94.28, 20.04, -44.19, 115.62, 0.0]\n  Target bbox: [572.46, 329.79, 606.39, 398.51]\n\nFrame 2:\n  Drone pose: [101.11, 94.76, 20.0, -44.24, 111.66, 0.0]\n  Target bbox: [619.51, 322.92, 660.26, 396.44]\n\nFrame 3:\n  Drone pose: [100.72, 95.28, 20.1, -44.26, 106.97, 0.0]\n  Target bbox: [685.58, 330.57, 714.81, 395.47]\n\nFrame 4:\n  Drone pose: [100.46, 95.93, 20.0, -44.28, 112.78, 0.0]\n  Target bbox: [625.65, 326.5, 654.16, 392.81]\n\nFrame 5 (current):\n  Drone pose: [100.14, 96.51, 20.0, -47.06, 118.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 561.14, \"ymin\": 281.68, \"xmax\": 597.64, \"ymax\": 347.98}, \"waypoint_deltas\": [{\"dx\": -0.31, \"dy\": 0.57, \"dz\": 0.0, \"dpitch\": 2.79, \"dyaw\": -4.39, \"droll\": 0.0}, {\"dx\": -0.6, \"dy\": 1.15, \"dz\": 0.0, \"dpitch\": 2.81, \"dyaw\": -3.73, \"droll\": 0.0}, {\"dx\": -0.87, \"dy\": 1.72, \"dz\": 0.0, \"dpitch\": 2.86, \"dyaw\": -3.06, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 2.28, \"dz\": 0.0, \"dpitch\": 2.61, \"dyaw\": -3.69, \"droll\": 0.0}, {\"dx\": -1.41, \"dy\": 2.85, \"dz\": 0.0, \"dpitch\": 2.66, \"dyaw\": -2.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.04, "window_alt_abs_m": 0.23, "target_px_mean_hist": 512.8, "cur_frame_id": 50, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00059/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.84, 99.34, 20.16, -44.13, 114.5, 0.0]\n  Target bbox: [636.45, 332.63, 671.19, 400.6]\n\nFrame 2:\n  Drone pose: [98.4, 99.89, 19.93, -40.16, 113.8, 0.0]\n  Target bbox: [649.09, 393.67, 681.24, 463.04]\n\nFrame 3:\n  Drone pose: [98.25, 100.56, 20.02, -44.62, 122.02, 0.0]\n  Target bbox: [555.68, 321.32, 603.33, 393.12]\n\nFrame 4:\n  Drone pose: [97.83, 100.94, 20.05, -44.53, 115.75, 0.0]\n  Target bbox: [622.76, 326.74, 657.09, 392.47]\n\nFrame 5 (current):\n  Drone pose: [97.7, 101.76, 20.05, -47.43, 113.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 671.34, \"ymin\": 279.54, \"xmax\": 706.85, \"ymax\": 350.01}, \"waypoint_deltas\": [{\"dx\": -0.42, \"dy\": 0.43, \"dz\": -0.05, \"dpitch\": 2.95, \"dyaw\": 4.18, \"droll\": 0.0}, {\"dx\": -0.77, \"dy\": 0.99, \"dz\": -0.05, \"dpitch\": 2.97, \"dyaw\": 4.66, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 1.56, \"dz\": -0.05, \"dpitch\": 2.98, \"dyaw\": 5.1, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": 2.12, \"dz\": -0.05, \"dpitch\": 2.98, \"dyaw\": 5.5, \"droll\": 0.0}, {\"dx\": -1.91, \"dy\": 2.69, \"dz\": -0.05, \"dpitch\": 2.98, \"dyaw\": 5.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.86, "window_alt_abs_m": 0.35, "target_px_mean_hist": 517.5, "cur_frame_id": 59, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00069/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [95.41, 105.01, 20.0, -46.43, 114.33, 0.0]\n  Target bbox: [684.73, 294.68, 715.54, 361.77]\n\nFrame 2:\n  Drone pose: [95.03, 105.5, 20.1, -44.48, 119.64, 0.0]\n  Target bbox: [620.42, 323.26, 659.25, 396.08]\n\nFrame 3:\n  Drone pose: [94.64, 106.24, 20.1, -41.48, 118.17, 0.0]\n  Target bbox: [650.19, 382.46, 680.37, 446.35]\n\nFrame 4:\n  Drone pose: [94.18, 106.68, 20.0, -45.3, 124.16, 0.0]\n  Target bbox: [575.8, 312.51, 610.17, 381.27]\n\nFrame 5 (current):\n  Drone pose: [93.76, 107.24, 20.0, -44.74, 121.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.79, \"ymin\": 321.71, \"xmax\": 643.14, \"ymax\": 389.11}, \"waypoint_deltas\": [{\"dx\": -0.41, \"dy\": 0.56, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": -0.66, \"droll\": 0.0}, {\"dx\": -0.83, \"dy\": 1.11, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": -1.25, \"dy\": 1.66, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": -0.09, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": 2.21, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.17, \"droll\": 0.0}, {\"dx\": -2.1, \"dy\": 2.76, \"dz\": 0.0, \"dpitch\": 0.23, \"dyaw\": 0.42, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.37, "window_alt_abs_m": 0.2, "target_px_mean_hist": 528.5, "cur_frame_id": 69, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [91.66, 109.88, 20.04, -44.43, 121.79, 0.0]\n  Target bbox: [626.1, 326.46, 653.66, 392.85]\n\nFrame 2:\n  Drone pose: [91.23, 110.54, 20.0, -47.59, 117.21, 0.0]\n  Target bbox: [680.04, 273.61, 720.45, 345.89]\n\nFrame 3:\n  Drone pose: [90.76, 110.97, 20.07, -44.5, 122.19, 0.0]\n  Target bbox: [625.6, 326.2, 654.16, 393.06]\n\nFrame 4:\n  Drone pose: [90.34, 111.65, 20.06, -44.64, 122.68, 0.0]\n  Target bbox: [620.93, 323.45, 658.71, 395.86]\n\nFrame 5 (current):\n  Drone pose: [90.0, 112.25, 19.99, -44.54, 123.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.61, \"ymin\": 325.37, \"xmax\": 655.13, \"ymax\": 393.85}, \"waypoint_deltas\": [{\"dx\": -0.54, \"dy\": 0.44, \"dz\": 0.01, \"dpitch\": 0.03, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": -0.99, \"dy\": 0.97, \"dz\": 0.01, \"dpitch\": 0.04, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": -1.45, \"dy\": 1.49, \"dz\": 0.01, \"dpitch\": 0.04, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": 2.01, \"dz\": 0.01, \"dpitch\": 0.05, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": 2.52, \"dz\": 0.01, \"dpitch\": 0.05, \"dyaw\": 0.3, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.59, "window_alt_abs_m": 0.2, "target_px_mean_hist": 531.5, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731/aug_001/frames_playback/frame_00087/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [87.55, 114.9, 19.99, -45.4, 126.42, 0.0]\n  Target bbox: [594.24, 316.42, 617.96, 379.99]\n\nFrame 2:\n  Drone pose: [86.98, 115.4, 19.96, -41.99, 119.25, 0.0]\n  Target bbox: [670.33, 369.85, 710.03, 443.02]\n\nFrame 3:\n  Drone pose: [86.48, 115.85, 20.07, -46.31, 118.35, 0.0]\n  Target bbox: [682.11, 301.12, 717.51, 370.96]\n\nFrame 4:\n  Drone pose: [86.15, 116.28, 20.0, -44.46, 123.65, 0.0]\n  Target bbox: [622.45, 325.51, 657.29, 393.68]\n\nFrame 5 (current):\n  Drone pose: [85.76, 116.82, 19.87, -43.22, 121.9, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 646.39, \"ymin\": 340.65, \"xmax\": 684.36, \"ymax\": 413.21}, \"waypoint_deltas\": [{\"dx\": -0.61, \"dy\": 0.44, \"dz\": 0.13, \"dpitch\": -1.22, \"dyaw\": 1.71, \"droll\": 0.0}, {\"dx\": -1.12, \"dy\": 0.92, \"dz\": 0.13, \"dpitch\": -1.21, \"dyaw\": 1.65, \"droll\": 0.0}, {\"dx\": -1.64, \"dy\": 1.4, \"dz\": 0.13, \"dpitch\": -1.19, \"dyaw\": 1.58, \"droll\": 0.0}, {\"dx\": -2.14, \"dy\": 1.87, \"dz\": 0.13, \"dpitch\": -1.16, \"dyaw\": 1.53, \"droll\": 0.0}, {\"dx\": -2.64, \"dy\": 2.35, \"dz\": 0.13, \"dpitch\": -1.13, \"dyaw\": 1.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.11, "window_alt_abs_m": 0.34, "target_px_mean_hist": 526.8, "cur_frame_id": 87, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-13/trajectory_1776061731", "difficulty_score": 0.2565, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.85, 47.29, 22.0, -47.68, 87.77, 0.0]\n  Target bbox: [623.48, 327.33, 656.59, 391.91]\n\nFrame 2:\n  Drone pose: [-112.17, 46.76, 21.2, -45.02, 85.56, 0.0]\n  Target bbox: [621.69, 326.33, 658.31, 393.09]\n\nFrame 3:\n  Drone pose: [-112.11, 46.85, 20.67, -43.64, 84.46, 0.0]\n  Target bbox: [621.33, 325.5, 658.73, 393.98]\n\nFrame 4:\n  Drone pose: [-111.89, 47.23, 20.64, -43.39, 83.71, 0.0]\n  Target bbox: [621.52, 325.04, 658.56, 394.4]\n\nFrame 5 (current):\n  Drone pose: [-111.66, 47.7, 20.62, -43.27, 82.98, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.92, \"ymin\": 324.95, \"xmax\": 659.16, \"ymax\": 394.55}, \"waypoint_deltas\": [{\"dx\": 0.19, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.08, \"dyaw\": -0.83, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": 1.02, \"dz\": -0.05, \"dpitch\": 0.17, \"dyaw\": -1.79, \"droll\": 0.0}, {\"dx\": 0.44, \"dy\": 1.51, \"dz\": -0.07, \"dpitch\": 0.19, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 2.0, \"dz\": -0.09, \"dpitch\": 0.22, \"dyaw\": -1.33, \"droll\": 0.0}, {\"dx\": 0.55, \"dy\": 2.49, \"dz\": -0.2, \"dpitch\": 0.38, \"dyaw\": -1.22, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.79, "window_alt_abs_m": 1.38, "target_px_mean_hist": 494.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.41, 55.69, 20.13, -42.4, 80.96, 0.0]\n  Target bbox: [621.58, 328.85, 658.19, 390.49]\n\nFrame 2:\n  Drone pose: [-111.54, 56.21, 20.12, -42.38, 80.6, 0.0]\n  Target bbox: [620.22, 328.73, 659.56, 390.58]\n\nFrame 3:\n  Drone pose: [-111.69, 56.76, 20.1, -42.38, 80.17, 0.0]\n  Target bbox: [619.16, 328.25, 660.59, 391.11]\n\nFrame 4:\n  Drone pose: [-111.87, 57.32, 20.09, -42.4, 79.66, 0.0]\n  Target bbox: [620.51, 328.7, 659.28, 390.62]\n\nFrame 5 (current):\n  Drone pose: [-112.07, 57.91, 20.08, -42.45, 79.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.91, \"ymin\": 326.98, \"xmax\": 660.83, \"ymax\": 392.35}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": 0.61, \"dz\": -0.01, \"dpitch\": -0.08, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 1.23, \"dz\": -0.02, \"dpitch\": -0.16, \"dyaw\": -1.4, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": 1.86, \"dz\": -0.03, \"dpitch\": -0.24, \"dyaw\": -2.12, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": 2.47, \"dz\": -0.04, \"dpitch\": -0.32, \"dyaw\": -2.75, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 3.06, \"dz\": -0.04, \"dpitch\": -0.39, \"dyaw\": -3.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.89, "window_alt_abs_m": 0.06, "target_px_mean_hist": 515.8, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.77, 67.04, 20.0, -42.42, 68.46, 0.0]\n  Target bbox: [620.99, 325.53, 659.19, 393.98]\n\nFrame 2:\n  Drone pose: [-110.45, 67.59, 20.0, -42.39, 67.93, 0.0]\n  Target bbox: [620.34, 324.38, 659.89, 395.14]\n\nFrame 3:\n  Drone pose: [-110.12, 68.13, 20.0, -42.35, 67.47, 0.0]\n  Target bbox: [622.4, 326.56, 657.76, 392.87]\n\nFrame 4:\n  Drone pose: [-109.75, 68.67, 20.0, -42.33, 67.1, 0.0]\n  Target bbox: [622.35, 325.85, 657.83, 393.57]\n\nFrame 5 (current):\n  Drone pose: [-109.35, 69.19, 20.0, -42.3, 66.81, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 622.23, \"ymin\": 325.11, \"xmax\": 658.0, \"ymax\": 394.37}, \"waypoint_deltas\": [{\"dx\": 0.42, \"dy\": 0.52, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.2, \"droll\": 0.0}, {\"dx\": 0.86, \"dy\": 1.02, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": 1.31, \"dy\": 1.52, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": 1.75, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.6, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": 2.48, \"dz\": 0.0, \"dpitch\": 0.21, \"dyaw\": -0.81, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.65, "window_alt_abs_m": 0.0, "target_px_mean_hist": 509.8, "cur_frame_id": 44, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.75, 76.59, 20.0, -42.31, 71.5, 0.0]\n  Target bbox: [621.31, 326.75, 658.52, 392.63]\n\nFrame 2:\n  Drone pose: [-104.6, 77.04, 20.0, -42.31, 71.92, 0.0]\n  Target bbox: [620.67, 327.17, 659.18, 392.2]\n\nFrame 3:\n  Drone pose: [-104.46, 77.5, 20.0, -42.31, 72.32, 0.0]\n  Target bbox: [621.58, 327.66, 658.26, 391.74]\n\nFrame 4:\n  Drone pose: [-104.34, 77.96, 20.0, -42.31, 72.69, 0.0]\n  Target bbox: [617.5, 326.1, 662.33, 393.32]\n\nFrame 5 (current):\n  Drone pose: [-104.22, 78.42, 20.0, -42.31, 73.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.65, \"ymin\": 327.93, \"xmax\": 656.19, \"ymax\": 391.41}, \"waypoint_deltas\": [{\"dx\": 0.11, \"dy\": 0.46, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 0.21, \"dy\": 0.93, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": 0.31, \"dy\": 1.39, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": 0.41, \"dy\": 1.86, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.17, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 2.33, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 1.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 514.5, "cur_frame_id": 64, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.72, 86.64, 20.0, -42.47, 77.19, 0.0]\n  Target bbox: [618.39, 327.43, 661.39, 391.95]\n\nFrame 2:\n  Drone pose: [-101.59, 87.21, 20.0, -42.6, 77.51, 0.0]\n  Target bbox: [619.64, 327.81, 660.17, 391.52]\n\nFrame 3:\n  Drone pose: [-101.46, 87.79, 20.0, -42.75, 77.82, 0.0]\n  Target bbox: [620.41, 328.04, 659.39, 391.27]\n\nFrame 4:\n  Drone pose: [-101.34, 88.37, 20.0, -42.89, 78.1, 0.0]\n  Target bbox: [626.69, 328.33, 653.13, 390.95]\n\nFrame 5 (current):\n  Drone pose: [-101.24, 88.93, 20.0, -43.0, 78.33, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.64, \"ymin\": 328.74, \"xmax\": 654.17, \"ymax\": 390.56}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.18, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 0.45, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": 1.98, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 0.57, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 2.45, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.14, "window_alt_abs_m": 0.0, "target_px_mean_hist": 526.2, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00105/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.62, 96.82, 20.0, -42.74, 77.36, 0.0]\n  Target bbox: [624.34, 328.08, 655.47, 391.2]\n\nFrame 2:\n  Drone pose: [-101.89, 97.4, 20.0, -42.77, 76.57, 0.0]\n  Target bbox: [621.77, 328.0, 658.03, 391.34]\n\nFrame 3:\n  Drone pose: [-102.19, 98.0, 20.0, -42.8, 75.69, 0.0]\n  Target bbox: [619.41, 327.44, 660.41, 391.89]\n\nFrame 4:\n  Drone pose: [-102.5, 98.61, 20.0, -42.84, 74.79, 0.0]\n  Target bbox: [621.99, 327.59, 657.84, 391.7]\n\nFrame 5 (current):\n  Drone pose: [-102.78, 99.23, 20.0, -42.89, 73.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.09, \"ymin\": 326.81, \"xmax\": 660.75, \"ymax\": 392.5}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": -0.39, \"dy\": 1.23, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": -1.23, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 1.84, \"dz\": 0.0, \"dpitch\": -0.26, \"dyaw\": -1.51, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 2.42, \"dz\": 0.0, \"dpitch\": -0.37, \"dyaw\": -1.59, \"droll\": 0.0}, {\"dx\": -0.42, \"dy\": 2.97, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -2.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.41, "window_alt_abs_m": 0.0, "target_px_mean_hist": 525.5, "cur_frame_id": 105, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00125/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.7, 106.86, 20.0, -43.87, 54.22, 0.0]\n  Target bbox: [622.66, 325.7, 657.3, 393.61]\n\nFrame 2:\n  Drone pose: [-102.64, 107.2, 20.0, -43.88, 52.6, 0.0]\n  Target bbox: [616.52, 322.42, 663.59, 397.09]\n\nFrame 3:\n  Drone pose: [-102.58, 107.55, 20.0, -43.89, 50.99, 0.0]\n  Target bbox: [624.36, 326.73, 655.59, 392.56]\n\nFrame 4:\n  Drone pose: [-102.51, 107.91, 20.0, -43.88, 49.35, 0.0]\n  Target bbox: [620.18, 324.02, 659.86, 395.38]\n\nFrame 5 (current):\n  Drone pose: [-102.45, 108.29, 20.0, -43.88, 47.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.09, \"ymin\": 324.6, \"xmax\": 658.94, \"ymax\": 394.79}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.41, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 0.84, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -3.5, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -5.31, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 1.77, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -7.16, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": 2.27, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -9.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.54, "window_alt_abs_m": 0.0, "target_px_mean_hist": 549.8, "cur_frame_id": 125, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-99.05, 115.35, 20.0, -43.69, 22.38, 0.0]\n  Target bbox: [625.01, 327.24, 655.13, 392.03]\n\nFrame 2:\n  Drone pose: [-98.66, 115.59, 20.0, -43.68, 21.62, 0.0]\n  Target bbox: [621.8, 321.53, 658.59, 398.03]\n\nFrame 3:\n  Drone pose: [-98.27, 115.82, 20.0, -43.65, 20.89, 0.0]\n  Target bbox: [622.16, 321.78, 658.22, 397.75]\n\nFrame 4:\n  Drone pose: [-97.87, 116.04, 20.0, -43.63, 20.2, 0.0]\n  Target bbox: [623.63, 323.21, 656.71, 396.31]\n\nFrame 5 (current):\n  Drone pose: [-97.46, 116.25, 20.0, -43.61, 19.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.4, \"ymin\": 323.35, \"xmax\": 656.94, \"ymax\": 396.18}, \"waypoint_deltas\": [{\"dx\": 0.4, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 1.23, \"dy\": 0.59, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -1.82, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -2.37, \"droll\": 0.0}, {\"dx\": 2.08, \"dy\": 0.94, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -2.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.85, "window_alt_abs_m": 0.0, "target_px_mean_hist": 535.5, "cur_frame_id": 146, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00165/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00166/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.49, 118.23, 20.0, -42.77, 13.33, 0.0]\n  Target bbox: [626.62, 328.26, 653.57, 391.06]\n\nFrame 2:\n  Drone pose: [-90.03, 118.24, 20.0, -42.73, 13.28, 0.0]\n  Target bbox: [626.06, 326.45, 654.19, 392.96]\n\nFrame 3:\n  Drone pose: [-89.56, 118.24, 20.0, -42.68, 13.27, 0.0]\n  Target bbox: [626.22, 326.9, 654.01, 392.49]\n\nFrame 4:\n  Drone pose: [-89.09, 118.22, 20.0, -42.64, 13.29, 0.0]\n  Target bbox: [626.25, 326.65, 653.98, 392.69]\n\nFrame 5 (current):\n  Drone pose: [-88.62, 118.21, 20.0, -42.59, 13.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.26, \"ymin\": 322.37, \"xmax\": 656.16, \"ymax\": 397.27}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.9, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.13, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 2.37, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.1, "window_alt_abs_m": 0.0, "target_px_mean_hist": 534.0, "cur_frame_id": 166, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00182/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00183/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00184/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00185/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/ORI/frames_playback/frame_00186/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.06, 120.11, 20.0, -42.2, 13.29, 0.0]\n  Target bbox: [625.32, 324.47, 655.02, 395.12]\n\nFrame 2:\n  Drone pose: [-80.61, 120.15, 20.0, -42.14, 13.17, 0.0]\n  Target bbox: [626.88, 328.86, 653.3, 390.5]\n\nFrame 3:\n  Drone pose: [-80.15, 120.17, 20.0, -42.09, 13.09, 0.0]\n  Target bbox: [626.65, 327.84, 653.57, 391.57]\n\nFrame 4:\n  Drone pose: [-79.69, 120.18, 20.0, -42.05, 13.05, 0.0]\n  Target bbox: [625.08, 323.9, 655.3, 395.77]\n\nFrame 5 (current):\n  Drone pose: [-79.22, 120.17, 20.0, -42.01, 13.04, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.23, \"ymin\": 324.2, \"xmax\": 655.14, \"ymax\": 395.44}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.03, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.11, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.3, \"droll\": 0.0}, {\"dx\": 2.53, \"dy\": -0.15, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.25, "window_alt_abs_m": 0.0, "target_px_mean_hist": 524.8, "cur_frame_id": 186, "source": "ORI", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.85, 47.29, 22.0, -50.74, 89.94, 0.0]\n  Target bbox: [596.03, 272.45, 633.8, 342.29]\n\nFrame 2:\n  Drone pose: [-112.12, 46.82, 21.34, -45.97, 81.09, 0.0]\n  Target bbox: [675.93, 315.72, 713.91, 381.94]\n\nFrame 3:\n  Drone pose: [-112.15, 46.99, 20.61, -43.66, 84.3, 0.0]\n  Target bbox: [618.62, 322.54, 661.39, 396.91]\n\nFrame 4:\n  Drone pose: [-111.78, 47.22, 20.58, -43.22, 84.0, 0.0]\n  Target bbox: [617.21, 321.97, 662.83, 397.61]\n\nFrame 5 (current):\n  Drone pose: [-111.83, 47.77, 20.64, -40.16, 79.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 660.61, \"ymin\": 376.66, \"xmax\": 699.21, \"ymax\": 449.5}, \"waypoint_deltas\": [{\"dx\": 0.36, \"dy\": 0.44, \"dz\": -0.05, \"dpitch\": -3.03, \"dyaw\": 2.88, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 0.95, \"dz\": -0.07, \"dpitch\": -2.94, \"dyaw\": 1.92, \"droll\": 0.0}, {\"dx\": 0.61, \"dy\": 1.44, \"dz\": -0.09, \"dpitch\": -2.92, \"dyaw\": 2.2, \"droll\": 0.0}, {\"dx\": 0.68, \"dy\": 1.93, \"dz\": -0.11, \"dpitch\": -2.89, \"dyaw\": 2.38, \"droll\": 0.0}, {\"dx\": 0.72, \"dy\": 2.42, \"dz\": -0.22, \"dpitch\": -2.73, \"dyaw\": 2.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.11, "window_alt_abs_m": 1.48, "target_px_mean_hist": 515.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-111.55, 55.56, 20.08, -42.04, 80.65, 0.0]\n  Target bbox: [625.13, 327.07, 654.6, 392.34]\n\nFrame 2:\n  Drone pose: [-111.59, 56.23, 20.3, -42.58, 80.45, 0.0]\n  Target bbox: [623.05, 326.79, 656.66, 392.6]\n\nFrame 3:\n  Drone pose: [-111.69, 56.76, 20.1, -42.31, 80.17, 0.0]\n  Target bbox: [621.55, 325.43, 658.15, 393.86]\n\nFrame 4:\n  Drone pose: [-111.87, 57.32, 20.09, -42.33, 79.66, 0.0]\n  Target bbox: [616.16, 324.32, 663.46, 395.05]\n\nFrame 5 (current):\n  Drone pose: [-112.07, 57.91, 20.08, -42.38, 79.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.65, \"ymin\": 324.52, \"xmax\": 658.02, \"ymax\": 394.84}, \"waypoint_deltas\": [{\"dx\": -0.23, \"dy\": 0.61, \"dz\": -0.01, \"dpitch\": -0.15, \"dyaw\": -0.67, \"droll\": 0.0}, {\"dx\": -0.48, \"dy\": 1.23, \"dz\": -0.02, \"dpitch\": -0.23, \"dyaw\": -1.4, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": 1.86, \"dz\": -0.03, \"dpitch\": -0.31, \"dyaw\": -2.12, \"droll\": 0.0}, {\"dx\": -0.92, \"dy\": 2.47, \"dz\": -0.04, \"dpitch\": -0.39, \"dyaw\": -2.75, \"droll\": 0.0}, {\"dx\": -1.07, \"dy\": 3.06, \"dz\": -0.04, \"dpitch\": -0.46, \"dyaw\": -3.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.58, "window_alt_abs_m": 0.44, "target_px_mean_hist": 536.2, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.77, 67.04, 20.0, -45.19, 65.98, 0.0]\n  Target bbox: [649.09, 275.4, 693.61, 349.36]\n\nFrame 2:\n  Drone pose: [-110.45, 67.59, 20.0, -44.39, 66.87, 0.0]\n  Target bbox: [631.4, 286.09, 675.87, 363.95]\n\nFrame 3:\n  Drone pose: [-110.12, 68.13, 20.0, -41.51, 71.97, 0.0]\n  Target bbox: [562.25, 339.13, 605.31, 408.9]\n\nFrame 4:\n  Drone pose: [-109.78, 68.73, 20.16, -44.5, 71.96, 0.0]\n  Target bbox: [557.08, 293.14, 598.57, 364.42]\n\nFrame 5 (current):\n  Drone pose: [-109.28, 69.09, 19.98, -39.57, 70.77, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 571.14, \"ymin\": 365.8, \"xmax\": 616.68, \"ymax\": 440.87}, \"waypoint_deltas\": [{\"dx\": 0.35, \"dy\": 0.62, \"dz\": 0.02, \"dpitch\": -2.71, \"dyaw\": -4.16, \"droll\": 0.0}, {\"dx\": 0.79, \"dy\": 1.12, \"dz\": 0.02, \"dpitch\": -2.68, \"dyaw\": -4.31, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": 1.62, \"dz\": 0.02, \"dpitch\": -2.65, \"dyaw\": -4.44, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": 2.1, \"dz\": 0.02, \"dpitch\": -2.6, \"dyaw\": -4.56, \"droll\": 0.0}, {\"dx\": 2.09, \"dy\": 2.58, \"dz\": 0.02, \"dpitch\": -2.52, \"dyaw\": -4.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.18, "window_alt_abs_m": 0.34, "target_px_mean_hist": 522.8, "cur_frame_id": 44, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00064/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-104.76, 76.74, 20.08, -44.29, 73.12, 0.0]\n  Target bbox: [593.5, 293.47, 640.92, 368.12]\n\nFrame 2:\n  Drone pose: [-104.6, 77.04, 20.0, -44.03, 76.92, 0.0]\n  Target bbox: [555.68, 296.86, 598.84, 366.07]\n\nFrame 3:\n  Drone pose: [-104.32, 77.6, 20.02, -41.73, 70.21, 0.0]\n  Target bbox: [647.99, 336.46, 692.2, 407.99]\n\nFrame 4:\n  Drone pose: [-104.34, 77.96, 20.0, -46.35, 70.91, 0.0]\n  Target bbox: [643.28, 254.38, 681.75, 327.06]\n\nFrame 5 (current):\n  Drone pose: [-104.27, 78.51, 20.1, -46.3, 72.56, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 617.86, \"ymin\": 259.25, \"xmax\": 668.59, \"ymax\": 331.72}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": 0.37, \"dz\": -0.1, \"dpitch\": 4.0, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 0.84, \"dz\": -0.1, \"dpitch\": 4.0, \"dyaw\": 1.08, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": 1.3, \"dz\": -0.1, \"dpitch\": 4.01, \"dyaw\": 1.36, \"droll\": 0.0}, {\"dx\": 0.46, \"dy\": 1.77, \"dz\": -0.1, \"dpitch\": 4.02, \"dyaw\": 1.64, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": 2.24, \"dz\": -0.1, \"dpitch\": 4.02, \"dyaw\": 1.93, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.87, "window_alt_abs_m": 0.22, "target_px_mean_hist": 545.5, "cur_frame_id": 64, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.65, 86.62, 20.09, -46.52, 73.14, 0.0]\n  Target bbox: [667.92, 257.77, 718.69, 330.06]\n\nFrame 2:\n  Drone pose: [-101.44, 87.23, 20.11, -42.44, 78.84, 0.0]\n  Target bbox: [603.16, 330.19, 652.71, 400.32]\n\nFrame 3:\n  Drone pose: [-101.31, 87.78, 20.11, -40.32, 82.99, 0.0]\n  Target bbox: [559.32, 370.06, 601.89, 437.78]\n\nFrame 4:\n  Drone pose: [-101.34, 88.37, 20.0, -43.82, 77.47, 0.0]\n  Target bbox: [632.61, 309.29, 662.74, 376.37]\n\nFrame 5 (current):\n  Drone pose: [-101.24, 88.93, 20.0, -46.35, 80.27, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 600.32, \"ymin\": 269.08, \"xmax\": 631.04, \"ymax\": 335.67}, \"waypoint_deltas\": [{\"dx\": 0.07, \"dy\": 0.53, \"dz\": 0.0, \"dpitch\": 3.29, \"dyaw\": -1.76, \"droll\": 0.0}, {\"dx\": 0.12, \"dy\": 1.03, \"dz\": 0.0, \"dpitch\": 3.27, \"dyaw\": -1.61, \"droll\": 0.0}, {\"dx\": 0.16, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": 3.29, \"dyaw\": -1.49, \"droll\": 0.0}, {\"dx\": 0.2, \"dy\": 1.98, \"dz\": 0.0, \"dpitch\": 3.32, \"dyaw\": -1.37, \"droll\": 0.0}, {\"dx\": 0.24, \"dy\": 2.45, \"dz\": 0.0, \"dpitch\": 3.35, \"dyaw\": -1.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.16, "window_alt_abs_m": 0.13, "target_px_mean_hist": 551.8, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00102/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00103/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00104/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00105/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.62, 96.82, 20.0, -42.67, 77.36, 0.0]\n  Target bbox: [619.66, 325.02, 660.04, 394.35]\n\nFrame 2:\n  Drone pose: [-101.89, 97.4, 20.0, -42.69, 76.57, 0.0]\n  Target bbox: [626.16, 325.12, 653.59, 394.18]\n\nFrame 3:\n  Drone pose: [-102.3, 98.03, 20.11, -42.89, 75.38, 0.0]\n  Target bbox: [615.49, 322.86, 664.18, 396.53]\n\nFrame 4:\n  Drone pose: [-102.5, 98.61, 20.0, -41.48, 79.79, 0.0]\n  Target bbox: [556.95, 348.64, 598.66, 417.21]\n\nFrame 5 (current):\n  Drone pose: [-102.86, 99.17, 19.97, -40.11, 74.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.58, \"ymin\": 365.03, \"xmax\": 661.62, \"ymax\": 439.85}, \"waypoint_deltas\": [{\"dx\": -0.15, \"dy\": 0.68, \"dz\": 0.03, \"dpitch\": -2.85, \"dyaw\": -0.77, \"droll\": 0.0}, {\"dx\": -0.31, \"dy\": 1.29, \"dz\": 0.03, \"dpitch\": -2.94, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 1.9, \"dz\": 0.03, \"dpitch\": -3.04, \"dyaw\": -1.56, \"droll\": 0.0}, {\"dx\": -0.38, \"dy\": 2.48, \"dz\": 0.03, \"dpitch\": -3.15, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": 3.03, \"dz\": 0.03, \"dpitch\": -3.02, \"dyaw\": -2.91, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.19, "window_alt_abs_m": 0.24, "target_px_mean_hist": 543.5, "cur_frame_id": 105, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00124/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00125/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-102.7, 106.8, 19.87, -39.87, 51.72, 0.0]\n  Target bbox: [648.06, 382.43, 696.54, 460.73]\n\nFrame 2:\n  Drone pose: [-102.71, 107.11, 19.84, -43.41, 52.61, 0.0]\n  Target bbox: [618.1, 321.43, 661.88, 397.97]\n\nFrame 3:\n  Drone pose: [-102.72, 107.57, 19.89, -43.53, 50.64, 0.0]\n  Target bbox: [613.02, 319.14, 667.11, 400.54]\n\nFrame 4:\n  Drone pose: [-102.45, 107.95, 20.08, -45.61, 54.43, 0.0]\n  Target bbox: [557.02, 297.15, 601.2, 372.96]\n\nFrame 5 (current):\n  Drone pose: [-102.45, 108.29, 20.0, -43.8, 47.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.04, \"ymin\": 322.04, \"xmax\": 661.98, \"ymax\": 397.44}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.41, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -1.72, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 0.84, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -3.5, \"droll\": 0.0}, {\"dx\": 0.19, \"dy\": 1.29, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -5.31, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": 1.77, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -7.16, \"droll\": 0.0}, {\"dx\": 0.34, \"dy\": 2.27, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -9.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.41, "window_alt_abs_m": 0.34, "target_px_mean_hist": 561.0, "cur_frame_id": 125, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00142/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00143/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00144/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00145/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00146/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-99.14, 115.29, 20.0, -43.46, 22.45, 0.0]\n  Target bbox: [621.84, 324.57, 658.31, 394.65]\n\nFrame 2:\n  Drone pose: [-98.6, 115.53, 20.09, -43.81, 26.85, 0.0]\n  Target bbox: [559.41, 325.91, 598.83, 396.47]\n\nFrame 3:\n  Drone pose: [-98.27, 115.82, 20.0, -43.58, 20.89, 0.0]\n  Target bbox: [620.62, 322.19, 659.74, 397.35]\n\nFrame 4:\n  Drone pose: [-97.87, 116.04, 20.0, -44.08, 17.59, 0.0]\n  Target bbox: [651.0, 310.59, 693.83, 392.43]\n\nFrame 5 (current):\n  Drone pose: [-97.46, 116.25, 20.0, -43.53, 19.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.16, \"ymin\": 321.09, \"xmax\": 659.23, \"ymax\": 398.39}, \"waypoint_deltas\": [{\"dx\": 0.4, \"dy\": 0.21, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 0.82, \"dy\": 0.4, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.24, \"droll\": 0.0}, {\"dx\": 1.23, \"dy\": 0.59, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -1.82, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": 0.77, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -2.37, \"droll\": 0.0}, {\"dx\": 2.08, \"dy\": 0.94, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": -2.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.61, "window_alt_abs_m": 0.18, "target_px_mean_hist": 558.8, "cur_frame_id": 146, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00162/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00163/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00164/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00165/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00166/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-90.33, 118.24, 20.05, -43.01, 13.43, 0.0]\n  Target bbox: [621.19, 321.77, 659.21, 397.64]\n\nFrame 2:\n  Drone pose: [-90.03, 118.24, 20.0, -42.65, 13.28, 0.0]\n  Target bbox: [622.68, 321.73, 657.74, 397.81]\n\nFrame 3:\n  Drone pose: [-89.54, 118.15, 20.06, -41.42, 12.38, 0.0]\n  Target bbox: [636.98, 343.09, 672.09, 419.31]\n\nFrame 4:\n  Drone pose: [-89.01, 118.33, 20.05, -42.78, 13.06, 0.0]\n  Target bbox: [622.65, 322.91, 657.73, 396.56]\n\nFrame 5 (current):\n  Drone pose: [-88.62, 118.21, 20.0, -46.23, 17.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.98, \"ymin\": 259.68, \"xmax\": 610.68, \"ymax\": 337.15}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 3.68, \"dyaw\": -3.84, \"droll\": 0.0}, {\"dx\": 0.95, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 3.72, \"dyaw\": -3.82, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": 3.75, \"dyaw\": -3.86, \"droll\": 0.0}, {\"dx\": 1.9, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 3.77, \"dyaw\": -3.97, \"droll\": 0.0}, {\"dx\": 2.37, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 3.79, \"dyaw\": -4.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.86, "window_alt_abs_m": 0.18, "target_px_mean_hist": 553.0, "cur_frame_id": 166, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00182/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00183/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00184/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00185/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580/aug_001/frames_playback/frame_00186/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-81.06, 120.11, 20.0, -43.95, 11.98, 0.0]\n  Target bbox: [639.77, 294.27, 673.97, 363.83]\n\nFrame 2:\n  Drone pose: [-80.69, 120.25, 20.15, -42.23, 12.84, 0.0]\n  Target bbox: [622.52, 322.46, 657.93, 397.22]\n\nFrame 3:\n  Drone pose: [-80.03, 120.26, 19.89, -42.03, 17.93, 0.0]\n  Target bbox: [558.69, 324.89, 596.37, 399.01]\n\nFrame 4:\n  Drone pose: [-79.66, 120.22, 20.15, -42.25, 12.94, 0.0]\n  Target bbox: [622.89, 323.49, 657.48, 396.05]\n\nFrame 5 (current):\n  Drone pose: [-79.2, 120.3, 20.01, -37.01, 11.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.91, \"ymin\": 405.46, \"xmax\": 668.43, \"ymax\": 482.63}, \"waypoint_deltas\": [{\"dx\": 0.47, \"dy\": -0.14, \"dz\": -0.01, \"dpitch\": -4.98, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": -0.17, \"dz\": -0.01, \"dpitch\": -4.96, \"dyaw\": 1.21, \"droll\": 0.0}, {\"dx\": 1.47, \"dy\": -0.21, \"dz\": -0.01, \"dpitch\": -4.96, \"dyaw\": 1.3, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.24, \"dz\": -0.01, \"dpitch\": -4.97, \"dyaw\": 1.41, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": -0.28, \"dz\": -0.01, \"dpitch\": -5.0, \"dyaw\": 1.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.96, "window_alt_abs_m": 0.81, "target_px_mean_hist": 535.2, "cur_frame_id": 186, "source": "aug_001", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-16/trajectory_580", "difficulty_score": 0.4523, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.11, 7.42, 22.0, -46.42, -87.14, 0.0]\n  Target bbox: [626.91, 329.55, 653.32, 389.65]\n\nFrame 2:\n  Drone pose: [-113.25, 5.77, 21.2, -46.85, -83.5, 0.0]\n  Target bbox: [620.66, 325.95, 659.68, 393.15]\n\nFrame 3:\n  Drone pose: [-113.77, 4.76, 20.67, -46.77, -81.73, 0.0]\n  Target bbox: [618.81, 325.67, 661.49, 393.34]\n\nFrame 4:\n  Drone pose: [-113.95, 4.06, 20.64, -47.0, -81.09, 0.0]\n  Target bbox: [620.81, 325.61, 659.48, 393.43]\n\nFrame 5 (current):\n  Drone pose: [-113.97, 3.48, 20.62, -47.06, -80.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.75, \"ymin\": 326.52, \"xmax\": 654.48, \"ymax\": 392.56}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": -0.52, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": -0.02, \"dyaw\": -0.33, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": -0.02, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": -2.05, \"dz\": -0.09, \"dpitch\": 0.1, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": -2.55, \"dz\": -0.2, \"dpitch\": 0.25, \"dyaw\": 0.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.16, "window_alt_abs_m": 1.38, "target_px_mean_hist": 555.0, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.57, 0.44, 20.39, -46.77, -80.66, 0.0]\n  Target bbox: [616.5, 324.97, 663.85, 394.14]\n\nFrame 2:\n  Drone pose: [-113.5, -0.04, 20.36, -46.71, -80.87, 0.0]\n  Target bbox: [625.9, 325.96, 654.34, 393.11]\n\nFrame 3:\n  Drone pose: [-113.45, -0.52, 20.33, -46.65, -81.05, 0.0]\n  Target bbox: [621.43, 325.17, 658.85, 393.82]\n\nFrame 4:\n  Drone pose: [-113.4, -1.0, 20.3, -46.59, -81.22, 0.0]\n  Target bbox: [622.1, 325.81, 658.17, 393.25]\n\nFrame 5 (current):\n  Drone pose: [-113.35, -1.48, 20.27, -46.53, -81.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.47, \"ymin\": 324.93, \"xmax\": 653.77, \"ymax\": 394.07}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": -0.48, \"dz\": -0.03, \"dpitch\": 0.06, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -0.97, \"dz\": -0.05, \"dpitch\": 0.11, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.47, \"dz\": -0.08, \"dpitch\": 0.14, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -1.98, \"dz\": -0.1, \"dpitch\": 0.15, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -2.5, \"dz\": -0.12, \"dpitch\": 0.14, \"dyaw\": -0.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.7, "window_alt_abs_m": 0.12, "target_px_mean_hist": 592.8, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.15, -4.51, 20.14, -46.42, -81.99, 0.0]\n  Target bbox: [624.91, 324.58, 655.36, 394.45]\n\nFrame 2:\n  Drone pose: [-113.12, -5.05, 20.12, -46.47, -82.06, 0.0]\n  Target bbox: [616.29, 324.99, 664.07, 394.11]\n\nFrame 3:\n  Drone pose: [-113.1, -5.59, 20.1, -46.52, -82.11, 0.0]\n  Target bbox: [620.31, 324.92, 660.0, 394.09]\n\nFrame 4:\n  Drone pose: [-113.08, -6.12, 20.09, -46.54, -82.14, 0.0]\n  Target bbox: [624.4, 324.62, 655.87, 394.34]\n\nFrame 5 (current):\n  Drone pose: [-113.08, -6.63, 20.08, -46.54, -82.15, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.01, \"ymin\": 325.89, \"xmax\": 654.23, \"ymax\": 393.14}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": -0.5, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.01, \"dy\": -0.99, \"dz\": -0.02, \"dpitch\": 0.05, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.02, \"dy\": -1.48, \"dz\": -0.03, \"dpitch\": 0.07, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.04, \"dy\": -1.97, \"dz\": -0.04, \"dpitch\": 0.09, \"dyaw\": -0.15, \"droll\": 0.0}, {\"dx\": 0.09, \"dy\": -2.47, \"dz\": -0.04, \"dpitch\": 0.09, \"dyaw\": -0.29, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.16, "window_alt_abs_m": 0.06, "target_px_mean_hist": 606.0, "cur_frame_id": 24, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.91, -9.61, 20.03, -46.47, -82.7, 0.0]\n  Target bbox: [618.38, 324.81, 661.95, 394.15]\n\nFrame 2:\n  Drone pose: [-112.78, -10.12, 20.03, -46.5, -83.09, 0.0]\n  Target bbox: [618.76, 325.25, 661.56, 393.73]\n\nFrame 3:\n  Drone pose: [-112.61, -10.63, 20.02, -46.55, -83.64, 0.0]\n  Target bbox: [616.82, 319.73, 663.06, 399.51]\n\nFrame 4:\n  Drone pose: [-112.38, -11.15, 20.02, -46.52, -82.76, 0.0]\n  Target bbox: [620.59, 322.69, 659.3, 396.47]\n\nFrame 5 (current):\n  Drone pose: [-112.12, -11.68, 20.02, -46.52, -81.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.06, \"ymin\": 319.73, \"xmax\": 663.81, \"ymax\": 399.59}, \"waypoint_deltas\": [{\"dx\": 0.28, \"dy\": -0.55, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": -1.12, \"dz\": -0.01, \"dpitch\": -0.08, \"dyaw\": 1.37, \"droll\": 0.0}, {\"dx\": 0.87, \"dy\": -1.72, \"dz\": -0.01, \"dpitch\": -0.16, \"dyaw\": 2.08, \"droll\": 0.0}, {\"dx\": 1.15, \"dy\": -2.33, \"dz\": -0.01, \"dpitch\": -0.28, \"dyaw\": 2.85, \"droll\": 0.0}, {\"dx\": 1.4, \"dy\": -2.98, \"dz\": -0.01, \"dpitch\": -0.43, \"dyaw\": 3.71, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.59, "window_alt_abs_m": 0.02, "target_px_mean_hist": 596.0, "cur_frame_id": 34, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.49, -15.32, 20.0, -47.1, -77.3, 0.0]\n  Target bbox: [620.22, 322.9, 659.75, 396.11]\n\nFrame 2:\n  Drone pose: [-110.3, -15.99, 20.0, -47.27, -76.19, 0.0]\n  Target bbox: [617.94, 319.45, 661.82, 399.68]\n\nFrame 3:\n  Drone pose: [-110.14, -16.66, 20.0, -47.4, -74.99, 0.0]\n  Target bbox: [619.72, 320.66, 660.03, 398.47]\n\nFrame 4:\n  Drone pose: [-109.98, -17.32, 20.0, -47.49, -73.78, 0.0]\n  Target bbox: [619.03, 320.01, 660.71, 399.1]\n\nFrame 5 (current):\n  Drone pose: [-109.81, -17.96, 20.0, -47.56, -72.59, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.27, \"ymin\": 319.33, \"xmax\": 661.45, \"ymax\": 399.76}, \"waypoint_deltas\": [{\"dx\": 0.18, \"dy\": -0.64, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 1.15, \"droll\": 0.0}, {\"dx\": 0.36, \"dy\": -1.26, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 2.26, \"droll\": 0.0}, {\"dx\": 0.56, \"dy\": -1.86, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": 3.31, \"droll\": 0.0}, {\"dx\": 0.77, \"dy\": -2.44, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 4.31, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -3.01, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 5.28, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.71, "window_alt_abs_m": 0.0, "target_px_mean_hist": 617.8, "cur_frame_id": 44, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.44, -22.08, 20.0, -47.25, -65.36, 0.0]\n  Target bbox: [618.1, 320.88, 661.59, 398.28]\n\nFrame 2:\n  Drone pose: [-108.27, -22.62, 20.0, -47.08, -64.32, 0.0]\n  Target bbox: [616.68, 321.15, 663.46, 397.94]\n\nFrame 3:\n  Drone pose: [-108.13, -23.15, 20.0, -47.22, -64.7, 0.0]\n  Target bbox: [618.83, 320.32, 660.84, 398.77]\n\nFrame 4:\n  Drone pose: [-108.0, -23.68, 20.0, -47.0, -63.57, 0.0]\n  Target bbox: [621.99, 322.49, 658.21, 396.55]\n\nFrame 5 (current):\n  Drone pose: [-107.89, -24.19, 20.0, -47.11, -63.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.78, \"ymin\": 320.43, \"xmax\": 666.34, \"ymax\": 398.79}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -0.32, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": 0.79, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": 0.29, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": 0.17, \"dyaw\": 0.98, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.85, "window_alt_abs_m": 0.0, "target_px_mean_hist": 595.8, "cur_frame_id": 55, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.04, -27.24, 20.0, -46.69, -61.94, 0.0]\n  Target bbox: [619.38, 322.76, 660.74, 396.36]\n\nFrame 2:\n  Drone pose: [-106.87, -27.74, 20.0, -46.81, -62.42, 0.0]\n  Target bbox: [620.74, 321.74, 658.96, 397.34]\n\nFrame 3:\n  Drone pose: [-106.7, -28.23, 20.0, -46.56, -61.49, 0.0]\n  Target bbox: [622.5, 322.98, 657.67, 396.07]\n\nFrame 4:\n  Drone pose: [-106.54, -28.73, 20.0, -46.68, -61.95, 0.0]\n  Target bbox: [617.61, 321.73, 662.07, 397.42]\n\nFrame 5 (current):\n  Drone pose: [-106.38, -29.23, 20.0, -46.43, -61.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.81, \"ymin\": 322.48, \"xmax\": 661.3, \"ymax\": 396.69}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.45, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.53, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": 0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 619.0, "cur_frame_id": 65, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.31, -32.22, 20.0, -46.47, -61.21, 0.0]\n  Target bbox: [625.48, 324.6, 654.29, 394.48]\n\nFrame 2:\n  Drone pose: [-105.11, -32.72, 20.0, -46.23, -60.37, 0.0]\n  Target bbox: [615.88, 320.69, 664.18, 398.48]\n\nFrame 3:\n  Drone pose: [-104.92, -33.22, 20.0, -46.38, -60.9, 0.0]\n  Target bbox: [621.12, 322.09, 658.57, 397.02]\n\nFrame 4:\n  Drone pose: [-104.73, -33.72, 20.0, -46.14, -60.05, 0.0]\n  Target bbox: [614.06, 320.35, 666.0, 398.98]\n\nFrame 5 (current):\n  Drone pose: [-104.55, -34.22, 20.0, -46.27, -60.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.62, \"ymin\": 321.95, \"xmax\": 660.04, \"ymax\": 397.25}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.22, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.72, "window_alt_abs_m": 0.0, "target_px_mean_hist": 592.8, "cur_frame_id": 75, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.44, -37.22, 20.0, -45.97, -59.49, 0.0]\n  Target bbox: [617.03, 321.25, 663.05, 397.95]\n\nFrame 2:\n  Drone pose: [-103.23, -37.72, 20.0, -46.13, -60.05, 0.0]\n  Target bbox: [623.1, 322.24, 656.57, 396.94]\n\nFrame 3:\n  Drone pose: [-103.02, -38.22, 20.0, -45.9, -59.28, 0.0]\n  Target bbox: [617.32, 321.42, 662.76, 397.79]\n\nFrame 4:\n  Drone pose: [-102.81, -38.72, 20.0, -46.07, -59.85, 0.0]\n  Target bbox: [622.86, 323.45, 656.88, 395.59]\n\nFrame 5 (current):\n  Drone pose: [-102.6, -39.22, 20.0, -45.84, -59.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.01, \"ymin\": 321.71, \"xmax\": 662.05, \"ymax\": 397.47}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": 0.28, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -0.75, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.7, "window_alt_abs_m": 0.0, "target_px_mean_hist": 580.2, "cur_frame_id": 85, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/ORI/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.42, -42.22, 20.0, -45.98, -59.56, 0.0]\n  Target bbox: [622.03, 325.56, 657.78, 393.43]\n\nFrame 2:\n  Drone pose: [-101.2, -42.72, 20.0, -45.76, -58.79, 0.0]\n  Target bbox: [621.88, 323.7, 658.26, 395.49]\n\nFrame 3:\n  Drone pose: [-100.99, -43.22, 20.0, -45.93, -59.37, 0.0]\n  Target bbox: [618.69, 321.94, 660.94, 397.31]\n\nFrame 4:\n  Drone pose: [-100.77, -43.71, 20.0, -45.7, -58.63, 0.0]\n  Target bbox: [614.24, 320.67, 665.77, 398.69]\n\nFrame 5 (current):\n  Drone pose: [-100.55, -44.2, 20.0, -45.86, -59.23, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.32, \"ymin\": 324.24, \"xmax\": 656.45, \"ymax\": 394.79}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.74, \"droll\": 0.0}, {\"dx\": 0.43, \"dy\": -0.98, \"dz\": 0.0, \"dpitch\": 0.08, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": 0.64, \"dy\": -1.48, \"dz\": 0.0, \"dpitch\": 0.32, \"dyaw\": 0.91, \"droll\": 0.0}, {\"dx\": 0.85, \"dy\": -1.97, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -2.46, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.68, "window_alt_abs_m": 0.0, "target_px_mean_hist": 607.5, "cur_frame_id": 95, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.11, 7.42, 22.0, -46.42, -87.14, 0.0]\n  Target bbox: [622.83, 328.13, 657.49, 391.04]\n\nFrame 2:\n  Drone pose: [-113.25, 5.77, 21.2, -48.64, -80.35, 0.0]\n  Target bbox: [581.47, 296.21, 625.52, 364.31]\n\nFrame 3:\n  Drone pose: [-113.77, 4.76, 20.67, -46.77, -81.73, 0.0]\n  Target bbox: [624.74, 326.54, 655.51, 392.55]\n\nFrame 4:\n  Drone pose: [-113.95, 4.06, 20.64, -47.23, -76.09, 0.0]\n  Target bbox: [558.4, 321.64, 606.35, 393.49]\n\nFrame 5 (current):\n  Drone pose: [-113.97, 3.48, 20.62, -42.48, -79.02, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 604.05, \"ymin\": 402.5, \"xmax\": 630.88, \"ymax\": 471.1}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": -0.52, \"dz\": -0.03, \"dpitch\": -4.59, \"dyaw\": -2.08, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.03, \"dz\": -0.05, \"dpitch\": -4.6, \"dyaw\": -2.28, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": -1.54, \"dz\": -0.07, \"dpitch\": -4.6, \"dyaw\": -2.51, \"droll\": 0.0}, {\"dx\": 0.26, \"dy\": -2.05, \"dz\": -0.09, \"dpitch\": -4.48, \"dyaw\": -1.19, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": -2.55, \"dz\": -0.2, \"dpitch\": -4.33, \"dyaw\": -1.43, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.72, "window_alt_abs_m": 1.38, "target_px_mean_hist": 556.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.57, 0.44, 20.39, -46.77, -80.66, 0.0]\n  Target bbox: [620.16, 325.74, 660.11, 393.3]\n\nFrame 2:\n  Drone pose: [-113.5, -0.04, 20.36, -41.73, -78.5, 0.0]\n  Target bbox: [590.42, 408.13, 634.37, 479.2]\n\nFrame 3:\n  Drone pose: [-113.45, -0.52, 20.33, -50.13, -83.13, 0.0]\n  Target bbox: [643.39, 267.46, 685.89, 335.35]\n\nFrame 4:\n  Drone pose: [-113.4, -1.0, 20.3, -45.52, -86.22, 0.0]\n  Target bbox: [680.28, 346.12, 716.16, 412.37]\n\nFrame 5 (current):\n  Drone pose: [-113.35, -1.48, 20.27, -46.53, -81.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.14, \"ymin\": 325.84, \"xmax\": 656.12, \"ymax\": 393.23}, \"waypoint_deltas\": [{\"dx\": 0.04, \"dy\": -0.48, \"dz\": -0.03, \"dpitch\": 0.06, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -0.97, \"dz\": -0.05, \"dpitch\": 0.11, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": 0.11, \"dy\": -1.47, \"dz\": -0.08, \"dpitch\": 0.14, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 0.14, \"dy\": -1.98, \"dz\": -0.1, \"dpitch\": 0.15, \"dyaw\": -0.46, \"droll\": 0.0}, {\"dx\": 0.17, \"dy\": -2.5, \"dz\": -0.12, \"dpitch\": 0.14, \"dyaw\": -0.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.72, "window_alt_abs_m": 0.12, "target_px_mean_hist": 592.5, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00024/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-113.23, -4.38, 20.04, -41.8, -87.56, 0.0]\n  Target bbox: [651.05, 381.53, 682.39, 448.6]\n\nFrame 2:\n  Drone pose: [-113.12, -5.05, 20.12, -43.81, -77.06, 0.0]\n  Target bbox: [564.22, 371.19, 598.99, 440.79]\n\nFrame 3:\n  Drone pose: [-112.98, -5.52, 20.03, -48.53, -78.52, 0.0]\n  Target bbox: [657.97, 304.82, 700.1, 383.41]\n\nFrame 4:\n  Drone pose: [-113.08, -6.12, 20.09, -46.54, -82.14, 0.0]\n  Target bbox: [624.39, 325.85, 655.92, 393.08]\n\nFrame 5 (current):\n  Drone pose: [-113.14, -6.72, 19.94, -49.69, -78.54, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 588.18, \"ymin\": 358.64, \"xmax\": 629.78, \"ymax\": 430.75}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": -0.41, \"dz\": 0.13, \"dpitch\": 3.17, \"dyaw\": -3.62, \"droll\": 0.0}, {\"dx\": 0.07, \"dy\": -0.9, \"dz\": 0.12, \"dpitch\": 3.2, \"dyaw\": -3.64, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": -1.39, \"dz\": 0.11, \"dpitch\": 3.22, \"dyaw\": -3.68, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": -1.88, \"dz\": 0.1, \"dpitch\": 3.24, \"dyaw\": -3.76, \"droll\": 0.0}, {\"dx\": 0.15, \"dy\": -2.38, \"dz\": 0.1, \"dpitch\": 3.24, \"dyaw\": -3.9, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.16, "window_alt_abs_m": 0.38, "target_px_mean_hist": 558.5, "cur_frame_id": 24, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00034/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-112.91, -9.61, 20.03, -47.36, -84.53, 0.0]\n  Target bbox: [638.48, 310.79, 685.37, 378.78]\n\nFrame 2:\n  Drone pose: [-112.78, -10.12, 20.03, -46.5, -83.09, 0.0]\n  Target bbox: [619.75, 325.18, 660.57, 393.79]\n\nFrame 3:\n  Drone pose: [-112.61, -10.63, 20.02, -46.55, -83.64, 0.0]\n  Target bbox: [619.79, 321.81, 660.05, 397.38]\n\nFrame 4:\n  Drone pose: [-112.38, -11.15, 20.02, -44.57, -79.81, 0.0]\n  Target bbox: [581.2, 354.63, 629.96, 431.27]\n\nFrame 5 (current):\n  Drone pose: [-112.16, -11.71, 19.94, -43.44, -85.13, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.45, \"ymin\": 320.89, \"xmax\": 663.52, \"ymax\": 398.69}, \"waypoint_deltas\": [{\"dx\": 0.32, \"dy\": -0.52, \"dz\": 0.07, \"dpitch\": -3.1, \"dyaw\": 3.84, \"droll\": 0.0}, {\"dx\": 0.62, \"dy\": -1.09, \"dz\": 0.07, \"dpitch\": -3.16, \"dyaw\": 4.51, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": -1.69, \"dz\": 0.07, \"dpitch\": -3.24, \"dyaw\": 5.22, \"droll\": 0.0}, {\"dx\": 1.19, \"dy\": -2.3, \"dz\": 0.07, \"dpitch\": -3.36, \"dyaw\": 5.99, \"droll\": 0.0}, {\"dx\": 1.44, \"dy\": -2.95, \"dz\": 0.07, \"dpitch\": -3.51, \"dyaw\": 6.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.13, "window_alt_abs_m": 0.09, "target_px_mean_hist": 595.5, "cur_frame_id": 34, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-110.49, -15.32, 20.0, -47.1, -77.3, 0.0]\n  Target bbox: [620.02, 321.38, 659.83, 397.67]\n\nFrame 2:\n  Drone pose: [-110.3, -15.99, 20.0, -51.4, -81.19, 0.0]\n  Target bbox: [671.31, 250.94, 724.07, 333.13]\n\nFrame 3:\n  Drone pose: [-110.14, -16.66, 20.0, -47.4, -74.99, 0.0]\n  Target bbox: [616.07, 319.58, 663.72, 399.65]\n\nFrame 4:\n  Drone pose: [-109.98, -17.32, 20.0, -42.63, -73.43, 0.0]\n  Target bbox: [622.13, 406.74, 649.46, 475.42]\n\nFrame 5 (current):\n  Drone pose: [-109.85, -17.95, 19.93, -49.06, -75.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 673.13, \"ymin\": 313.97, \"xmax\": 719.64, \"ymax\": 387.51}, \"waypoint_deltas\": [{\"dx\": 0.22, \"dy\": -0.65, \"dz\": 0.07, \"dpitch\": 1.45, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": 0.4, \"dy\": -1.27, \"dz\": 0.07, \"dpitch\": 1.44, \"dyaw\": 4.7, \"droll\": 0.0}, {\"dx\": 0.6, \"dy\": -1.87, \"dz\": 0.07, \"dpitch\": 1.45, \"dyaw\": 5.75, \"droll\": 0.0}, {\"dx\": 0.81, \"dy\": -2.45, \"dz\": 0.07, \"dpitch\": 1.5, \"dyaw\": 6.75, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -3.02, \"dz\": 0.07, \"dpitch\": 1.57, \"dyaw\": 7.72, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.25, "window_alt_abs_m": 0.07, "target_px_mean_hist": 619.5, "cur_frame_id": 44, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00055/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-108.44, -22.08, 20.0, -47.25, -65.36, 0.0]\n  Target bbox: [623.91, 323.72, 655.86, 395.33]\n\nFrame 2:\n  Drone pose: [-108.27, -22.62, 20.0, -47.08, -64.32, 0.0]\n  Target bbox: [622.19, 322.55, 658.01, 396.48]\n\nFrame 3:\n  Drone pose: [-108.13, -23.15, 20.0, -47.22, -64.7, 0.0]\n  Target bbox: [621.6, 322.78, 658.14, 396.32]\n\nFrame 4:\n  Drone pose: [-108.0, -23.68, 20.0, -47.0, -63.57, 0.0]\n  Target bbox: [613.73, 320.76, 666.36, 398.47]\n\nFrame 5 (current):\n  Drone pose: [-107.89, -24.19, 20.0, -42.11, -63.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 613.56, \"ymin\": 404.17, \"xmax\": 665.97, \"ymax\": 483.3}, \"waypoint_deltas\": [{\"dx\": 0.12, \"dy\": -0.52, \"dz\": 0.0, \"dpitch\": -5.1, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": 0.25, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -4.85, \"dyaw\": 0.76, \"droll\": 0.0}, {\"dx\": 0.38, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": -4.96, \"dyaw\": 0.38, \"droll\": 0.0}, {\"dx\": 0.53, \"dy\": -2.04, \"dz\": 0.0, \"dpitch\": -4.71, \"dyaw\": 1.4, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -2.55, \"dz\": 0.0, \"dpitch\": -4.83, \"dyaw\": 0.95, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.83, "window_alt_abs_m": 0.0, "target_px_mean_hist": 619.2, "cur_frame_id": 55, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00062/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00065/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-107.04, -27.24, 20.0, -43.86, -63.99, 0.0]\n  Target bbox: [638.25, 368.68, 690.17, 446.12]\n\nFrame 2:\n  Drone pose: [-106.87, -27.74, 20.0, -43.8, -63.03, 0.0]\n  Target bbox: [624.17, 371.58, 669.87, 448.73]\n\nFrame 3:\n  Drone pose: [-106.7, -28.23, 20.0, -46.56, -61.49, 0.0]\n  Target bbox: [613.89, 320.61, 666.17, 398.67]\n\nFrame 4:\n  Drone pose: [-106.54, -28.73, 20.0, -43.69, -56.95, 0.0]\n  Target bbox: [563.76, 374.1, 599.8, 449.28]\n\nFrame 5 (current):\n  Drone pose: [-106.38, -29.23, 20.0, -48.63, -66.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 674.61, \"ymin\": 286.76, \"xmax\": 722.44, \"ymax\": 361.91}, \"waypoint_deltas\": [{\"dx\": 0.16, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 2.09, \"dyaw\": 4.55, \"droll\": 0.0}, {\"dx\": 0.33, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 1.96, \"dyaw\": 4.07, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": 2.21, \"dyaw\": 4.98, \"droll\": 0.0}, {\"dx\": 0.69, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 2.07, \"dyaw\": 4.47, \"droll\": 0.0}, {\"dx\": 0.88, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 2.31, \"dyaw\": 5.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 16.09, "window_alt_abs_m": 0.0, "target_px_mean_hist": 621.0, "cur_frame_id": 65, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00075/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-105.36, -32.28, 19.83, -50.66, -57.46, 0.0]\n  Target bbox: [615.11, 253.64, 660.49, 330.83]\n\nFrame 2:\n  Drone pose: [-105.05, -32.83, 20.11, -48.99, -66.97, 0.0]\n  Target bbox: [614.28, 318.05, 665.63, 401.09]\n\nFrame 3:\n  Drone pose: [-104.92, -33.22, 20.0, -45.79, -55.9, 0.0]\n  Target bbox: [563.78, 334.14, 599.29, 408.5]\n\nFrame 4:\n  Drone pose: [-104.73, -33.72, 20.0, -41.34, -58.57, 0.0]\n  Target bbox: [598.29, 401.41, 646.35, 479.57]\n\nFrame 5 (current):\n  Drone pose: [-104.55, -34.22, 20.0, -46.27, -60.55, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.85, \"ymin\": 325.33, \"xmax\": 651.9, \"ymax\": 393.74}, \"waypoint_deltas\": [{\"dx\": 0.17, \"dy\": -0.5, \"dz\": 0.0, \"dpitch\": 0.25, \"dyaw\": 0.89, \"droll\": 0.0}, {\"dx\": 0.35, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 0.52, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 0.71, \"dy\": -2.0, \"dz\": 0.0, \"dpitch\": 0.22, \"dyaw\": 0.78, \"droll\": 0.0}, {\"dx\": 0.91, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": 0.25, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.22, "window_alt_abs_m": 0.4, "target_px_mean_hist": 603.5, "cur_frame_id": 75, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00085/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-103.44, -37.22, 20.0, -49.26, -57.96, 0.0]\n  Target bbox: [602.51, 267.89, 640.94, 341.02]\n\nFrame 2:\n  Drone pose: [-103.23, -37.72, 20.0, -45.57, -55.05, 0.0]\n  Target bbox: [560.62, 337.25, 601.74, 404.27]\n\nFrame 3:\n  Drone pose: [-102.97, -38.34, 19.94, -43.65, -64.1, 0.0]\n  Target bbox: [685.6, 311.92, 718.15, 379.59]\n\nFrame 4:\n  Drone pose: [-102.81, -38.72, 20.0, -51.07, -60.82, 0.0]\n  Target bbox: [634.7, 237.98, 668.33, 313.09]\n\nFrame 5 (current):\n  Drone pose: [-102.6, -39.22, 20.0, -44.81, -64.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 674.23, \"ymin\": 340.47, \"xmax\": 723.95, \"ymax\": 416.83}, \"waypoint_deltas\": [{\"dx\": 0.2, \"dy\": -0.49, \"dz\": 0.0, \"dpitch\": -1.18, \"dyaw\": 4.46, \"droll\": 0.0}, {\"dx\": 0.39, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.94, \"dyaw\": 5.28, \"droll\": 0.0}, {\"dx\": 0.58, \"dy\": -1.49, \"dz\": 0.0, \"dpitch\": -1.09, \"dyaw\": 4.77, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": -1.99, \"dz\": 0.0, \"dpitch\": -1.24, \"dyaw\": 4.25, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -2.5, \"dz\": 0.0, \"dpitch\": -1.01, \"dyaw\": 5.05, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.48, "window_alt_abs_m": 0.12, "target_px_mean_hist": 575.0, "cur_frame_id": 85, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00092/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247/aug_001/frames_playback/frame_00095/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-101.42, -42.22, 20.0, -47.51, -64.55, 0.0]\n  Target bbox: [680.16, 298.09, 716.73, 373.4]\n\nFrame 2:\n  Drone pose: [-101.2, -42.72, 20.0, -45.76, -58.79, 0.0]\n  Target bbox: [614.93, 318.9, 665.04, 400.49]\n\nFrame 3:\n  Drone pose: [-101.16, -43.24, 20.07, -44.58, -57.32, 0.0]\n  Target bbox: [569.13, 384.95, 611.52, 458.36]\n\nFrame 4:\n  Drone pose: [-100.77, -43.71, 20.0, -45.91, -57.89, 0.0]\n  Target bbox: [612.36, 318.92, 649.55, 393.24]\n\nFrame 5 (current):\n  Drone pose: [-100.66, -44.2, 20.12, -41.9, -52.14, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.54, \"ymin\": 349.38, \"xmax\": 599.52, \"ymax\": 420.77}, \"waypoint_deltas\": [{\"dx\": 0.33, \"dy\": -0.49, \"dz\": -0.12, \"dpitch\": -3.72, \"dyaw\": -6.35, \"droll\": 0.0}, {\"dx\": 0.54, \"dy\": -0.98, \"dz\": -0.12, \"dpitch\": -3.88, \"dyaw\": -6.93, \"droll\": 0.0}, {\"dx\": 0.75, \"dy\": -1.48, \"dz\": -0.12, \"dpitch\": -3.64, \"dyaw\": -6.18, \"droll\": 0.0}, {\"dx\": 0.96, \"dy\": -1.97, \"dz\": -0.12, \"dpitch\": -3.8, \"dyaw\": -6.74, \"droll\": 0.0}, {\"dx\": 1.18, \"dy\": -2.46, \"dz\": -0.12, \"dpitch\": -3.96, \"dyaw\": -7.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.56, "window_alt_abs_m": 0.27, "target_px_mean_hist": 609.2, "cur_frame_id": 95, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776260247", "difficulty_score": 0.2261, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.67, 9.41, 22.0, -46.66, -171.47, 0.0]\n  Target bbox: [628.07, 338.04, 651.91, 381.68]\n\nFrame 2:\n  Drone pose: [4.23, 8.16, 21.2, -47.08, -173.29, 0.0]\n  Target bbox: [630.35, 336.31, 649.8, 383.36]\n\nFrame 3:\n  Drone pose: [3.34, 7.48, 20.67, -47.03, -175.25, 0.0]\n  Target bbox: [625.45, 334.76, 654.52, 384.94]\n\nFrame 4:\n  Drone pose: [2.7, 7.09, 20.64, -47.18, -174.87, 0.0]\n  Target bbox: [626.66, 336.18, 653.33, 383.51]\n\nFrame 5 (current):\n  Drone pose: [2.16, 6.85, 20.62, -47.16, -174.03, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.92, \"ymin\": 336.7, \"xmax\": 649.21, \"ymax\": 382.89}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.16, \"dz\": -0.03, \"dpitch\": -0.01, \"dyaw\": -0.5, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.28, \"dz\": -0.05, \"dpitch\": 0.0, \"dyaw\": -0.89, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -0.39, \"dz\": -0.07, \"dpitch\": 0.01, \"dyaw\": -1.21, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.48, \"dz\": -0.09, \"dpitch\": 0.01, \"dyaw\": -1.5, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -0.57, \"dz\": -0.2, \"dpitch\": 0.15, \"dyaw\": -1.77, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.99, "window_alt_abs_m": 1.38, "target_px_mean_hist": 270.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-0.37, 6.28, 20.42, -47.01, -175.8, 0.0]\n  Target bbox: [631.13, 335.6, 649.01, 384.0]\n\nFrame 2:\n  Drone pose: [-0.88, 6.21, 20.39, -46.98, -176.04, 0.0]\n  Target bbox: [631.06, 337.58, 649.05, 382.0]\n\nFrame 3:\n  Drone pose: [-1.38, 6.13, 20.36, -46.96, -176.26, 0.0]\n  Target bbox: [630.87, 339.97, 649.22, 379.61]\n\nFrame 4:\n  Drone pose: [-1.89, 6.07, 20.33, -46.93, -176.46, 0.0]\n  Target bbox: [630.65, 338.28, 649.46, 381.3]\n\nFrame 5 (current):\n  Drone pose: [-2.4, 6.01, 20.3, -46.9, -176.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.98, \"ymin\": 339.69, \"xmax\": 649.11, \"ymax\": 379.89}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.04, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -0.08, \"dz\": -0.08, \"dpitch\": 0.07, \"dyaw\": -0.25, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": -0.07, \"dz\": -0.11, \"dpitch\": 0.09, \"dyaw\": -0.21, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.03, \"dz\": -0.13, \"dpitch\": 0.09, \"dyaw\": -0.1, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.84, "window_alt_abs_m": 0.12, "target_px_mean_hist": 296.0, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.48, 6.03, 20.15, -46.81, -176.56, 0.0]\n  Target bbox: [630.69, 339.59, 649.4, 379.98]\n\nFrame 2:\n  Drone pose: [-6.0, 6.11, 20.14, -46.82, -176.3, 0.0]\n  Target bbox: [630.7, 335.59, 649.45, 384.04]\n\nFrame 3:\n  Drone pose: [-6.53, 6.22, 20.12, -46.83, -175.97, 0.0]\n  Target bbox: [630.53, 336.58, 649.62, 383.04]\n\nFrame 4:\n  Drone pose: [-7.07, 6.34, 20.1, -46.85, -175.56, 0.0]\n  Target bbox: [630.82, 337.28, 649.31, 382.35]\n\nFrame 5 (current):\n  Drone pose: [-7.62, 6.49, 20.09, -46.88, -175.09, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.9, \"ymin\": 336.24, \"xmax\": 649.23, \"ymax\": 383.33}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": 0.16, \"dz\": -0.01, \"dpitch\": -0.04, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": -1.1, \"dy\": 0.34, \"dz\": -0.02, \"dpitch\": -0.09, \"dyaw\": 1.1, \"droll\": 0.0}, {\"dx\": -1.67, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": -0.15, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": -2.24, \"dy\": 0.67, \"dz\": -0.04, \"dpitch\": -0.22, \"dyaw\": 2.22, \"droll\": 0.0}, {\"dx\": -2.82, \"dy\": 0.81, \"dz\": -0.05, \"dpitch\": -0.3, \"dyaw\": 2.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.47, "window_alt_abs_m": 0.06, "target_px_mean_hist": 298.8, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-11.02, 7.38, 20.04, -47.29, -172.12, 0.0]\n  Target bbox: [629.81, 335.08, 650.36, 384.58]\n\nFrame 2:\n  Drone pose: [-11.62, 7.4, 20.03, -47.44, -172.0, 0.0]\n  Target bbox: [628.43, 336.77, 651.56, 382.82]\n\nFrame 3:\n  Drone pose: [-12.23, 7.37, 20.03, -47.49, -170.45, 0.0]\n  Target bbox: [626.4, 335.25, 653.55, 384.39]\n\nFrame 4:\n  Drone pose: [-12.85, 7.28, 20.02, -47.56, -169.05, 0.0]\n  Target bbox: [628.34, 336.44, 651.62, 383.16]\n\nFrame 5 (current):\n  Drone pose: [-13.48, 7.17, 20.02, -47.63, -167.73, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.1, \"ymin\": 336.95, \"xmax\": 651.88, \"ymax\": 382.64}, \"waypoint_deltas\": [{\"dx\": -0.64, \"dy\": -0.13, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 1.28, \"droll\": 0.0}, {\"dx\": -1.28, \"dy\": -0.25, \"dz\": -0.01, \"dpitch\": -0.14, \"dyaw\": 2.59, \"droll\": 0.0}, {\"dx\": -1.92, \"dy\": -0.36, \"dz\": -0.01, \"dpitch\": -0.19, \"dyaw\": 3.97, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.45, \"dz\": -0.01, \"dpitch\": -0.21, \"dyaw\": 5.41, \"droll\": 0.0}, {\"dx\": -3.19, \"dy\": -0.53, \"dz\": -0.01, \"dpitch\": -0.46, \"dyaw\": 5.27, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.38, "window_alt_abs_m": 0.02, "target_px_mean_hist": 302.0, "cur_frame_id": 33, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00043/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.31, 6.54, 20.01, -48.36, -162.63, 0.0]\n  Target bbox: [628.39, 336.94, 651.57, 382.61]\n\nFrame 2:\n  Drone pose: [-17.96, 6.42, 20.0, -48.4, -161.25, 0.0]\n  Target bbox: [627.65, 336.14, 652.3, 383.4]\n\nFrame 3:\n  Drone pose: [-18.61, 6.28, 20.0, -48.44, -159.95, 0.0]\n  Target bbox: [627.88, 337.02, 652.07, 382.55]\n\nFrame 4:\n  Drone pose: [-19.27, 6.11, 20.0, -48.48, -158.72, 0.0]\n  Target bbox: [626.33, 334.42, 653.57, 385.11]\n\nFrame 5 (current):\n  Drone pose: [-19.92, 5.91, 20.0, -48.52, -157.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.95, \"ymin\": 336.76, \"xmax\": 651.98, \"ymax\": 382.81}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.23, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": -0.75, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 2.94, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.41, \"dyaw\": 3.84, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": -1.31, \"dz\": 0.0, \"dpitch\": -0.5, \"dyaw\": 4.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 306.8, "cur_frame_id": 43, "source": "ORI", "fut_invisible_cnt": 1}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-23.75, 4.32, 20.0, -49.01, -152.08, 0.0]\n  Target bbox: [627.91, 336.22, 652.0, 383.3]\n\nFrame 2:\n  Drone pose: [-24.33, 4.03, 20.0, -48.95, -151.31, 0.0]\n  Target bbox: [627.3, 336.45, 652.6, 383.12]\n\nFrame 3:\n  Drone pose: [-24.89, 3.76, 20.0, -48.86, -150.53, 0.0]\n  Target bbox: [626.77, 335.98, 653.11, 383.61]\n\nFrame 4:\n  Drone pose: [-25.41, 3.41, 20.0, -48.76, -150.05, 0.0]\n  Target bbox: [626.98, 335.65, 652.9, 383.91]\n\nFrame 5 (current):\n  Drone pose: [-25.86, 3.08, 20.0, -48.54, -149.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.98, \"ymin\": 333.42, \"xmax\": 654.14, \"ymax\": 386.19}, \"waypoint_deltas\": [{\"dx\": -0.44, \"dy\": -0.32, \"dz\": 0.0, \"dpitch\": -0.49, \"dyaw\": 1.26, \"droll\": 0.0}, {\"dx\": -0.89, \"dy\": -0.66, \"dz\": 0.0, \"dpitch\": -0.99, \"dyaw\": 2.58, \"droll\": 0.0}, {\"dx\": -1.37, \"dy\": -1.0, \"dz\": 0.0, \"dpitch\": -0.81, \"dyaw\": 2.98, \"droll\": 0.0}, {\"dx\": -1.87, \"dy\": -1.36, \"dz\": 0.0, \"dpitch\": -0.68, \"dyaw\": 3.4, \"droll\": 0.0}, {\"dx\": -2.39, \"dy\": -1.72, \"dz\": 0.0, \"dpitch\": -0.58, \"dyaw\": 3.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 304.2, "cur_frame_id": 53, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.25, 1.36, 20.0, -49.12, -145.8, 0.0]\n  Target bbox: [628.72, 335.97, 651.15, 383.57]\n\nFrame 2:\n  Drone pose: [-28.79, 0.99, 20.0, -49.05, -145.36, 0.0]\n  Target bbox: [630.16, 337.65, 649.74, 381.88]\n\nFrame 3:\n  Drone pose: [-29.35, 0.61, 20.0, -49.0, -144.9, 0.0]\n  Target bbox: [627.63, 336.24, 652.23, 383.3]\n\nFrame 4:\n  Drone pose: [-29.91, 0.23, 20.0, -48.98, -144.45, 0.0]\n  Target bbox: [629.74, 336.73, 650.14, 382.81]\n\nFrame 5 (current):\n  Drone pose: [-30.48, -0.15, 20.0, -48.96, -143.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.41, \"ymin\": 336.46, \"xmax\": 651.46, \"ymax\": 383.06}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": -0.77, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -1.16, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -2.31, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": -2.91, \"dy\": -1.92, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.81, "window_alt_abs_m": 0.0, "target_px_mean_hist": 308.0, "cur_frame_id": 62, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-33.99, -2.44, 20.0, -48.91, -141.04, 0.0]\n  Target bbox: [628.61, 337.66, 651.26, 381.89]\n\nFrame 2:\n  Drone pose: [-34.6, -2.81, 20.0, -48.91, -140.46, 0.0]\n  Target bbox: [628.1, 336.38, 651.74, 383.18]\n\nFrame 3:\n  Drone pose: [-35.21, -3.18, 20.0, -48.92, -139.84, 0.0]\n  Target bbox: [628.57, 336.4, 651.28, 383.14]\n\nFrame 4:\n  Drone pose: [-35.84, -3.54, 20.0, -48.93, -139.19, 0.0]\n  Target bbox: [628.58, 336.08, 651.26, 383.44]\n\nFrame 5 (current):\n  Drone pose: [-36.48, -3.89, 20.0, -48.94, -138.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.63, \"ymin\": 337.1, \"xmax\": 651.22, \"ymax\": 382.43}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.35, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": -1.29, \"dy\": -0.7, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 1.42, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": -1.05, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": 2.12, \"droll\": 0.0}, {\"dx\": -2.57, \"dy\": -1.4, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 2.79, \"droll\": 0.0}, {\"dx\": -3.17, \"dy\": -1.76, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 3.38, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.53, "window_alt_abs_m": 0.0, "target_px_mean_hist": 316.8, "cur_frame_id": 72, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-40.21, -6.01, 20.0, -48.81, -134.67, 0.0]\n  Target bbox: [629.98, 339.05, 650.12, 380.48]\n\nFrame 2:\n  Drone pose: [-40.7, -6.38, 20.0, -48.64, -134.38, 0.0]\n  Target bbox: [631.67, 338.64, 648.43, 380.88]\n\nFrame 3:\n  Drone pose: [-41.1, -6.75, 20.0, -48.38, -134.29, 0.0]\n  Target bbox: [626.53, 335.6, 653.49, 384.03]\n\nFrame 4:\n  Drone pose: [-41.43, -7.12, 20.0, -48.6, -133.21, 0.0]\n  Target bbox: [626.2, 334.95, 653.8, 384.65]\n\nFrame 5 (current):\n  Drone pose: [-41.68, -7.5, 20.0, -48.74, -132.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.21, \"ymin\": 334.92, \"xmax\": 653.8, \"ymax\": 384.67}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.74, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": -0.79, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": 1.33, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": -1.2, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": 1.83, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -1.63, \"dz\": 0.0, \"dpitch\": -0.16, \"dyaw\": 2.29, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": -2.07, \"dz\": 0.0, \"dpitch\": -0.23, \"dyaw\": 2.76, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.36, "window_alt_abs_m": 0.0, "target_px_mean_hist": 300.8, "cur_frame_id": 82, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/ORI/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.55, -10.03, 20.0, -49.1, -129.02, 0.0]\n  Target bbox: [625.99, 334.54, 654.02, 385.0]\n\nFrame 2:\n  Drone pose: [-42.77, -10.51, 20.0, -49.3, -128.39, 0.0]\n  Target bbox: [628.83, 336.84, 651.31, 382.7]\n\nFrame 3:\n  Drone pose: [-43.05, -11.01, 20.0, -49.07, -129.0, 0.0]\n  Target bbox: [628.56, 336.22, 651.57, 383.26]\n\nFrame 4:\n  Drone pose: [-43.37, -11.54, 20.0, -48.9, -129.52, 0.0]\n  Target bbox: [629.24, 337.93, 650.88, 381.62]\n\nFrame 5 (current):\n  Drone pose: [-43.72, -12.07, 20.0, -48.79, -129.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.91, \"ymin\": 336.43, \"xmax\": 651.23, \"ymax\": 383.07}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.55, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": -1.68, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.23, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.36, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -2.82, \"dz\": 0.0, \"dpitch\": 0.39, \"dyaw\": -2.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.21, "window_alt_abs_m": 0.0, "target_px_mean_hist": 312.2, "cur_frame_id": 92, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [5.67, 9.41, 22.0, -46.66, -171.47, 0.0]\n  Target bbox: [627.95, 337.64, 652.03, 382.06]\n\nFrame 2:\n  Drone pose: [4.23, 8.16, 21.2, -45.65, -174.53, 0.0]\n  Target bbox: [645.05, 361.47, 663.59, 406.19]\n\nFrame 3:\n  Drone pose: [3.34, 7.48, 20.67, -47.03, -175.25, 0.0]\n  Target bbox: [625.59, 334.66, 654.37, 385.02]\n\nFrame 4:\n  Drone pose: [2.7, 7.09, 20.64, -49.35, -169.87, 0.0]\n  Target bbox: [569.47, 302.26, 596.06, 348.35]\n\nFrame 5 (current):\n  Drone pose: [2.16, 6.85, 20.62, -49.94, -175.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 643.47, \"ymin\": 290.9, \"xmax\": 661.74, \"ymax\": 335.64}, \"waypoint_deltas\": [{\"dx\": -0.51, \"dy\": -0.16, \"dz\": -0.03, \"dpitch\": 2.77, \"dyaw\": 0.59, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.28, \"dz\": -0.05, \"dpitch\": 2.78, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -0.39, \"dz\": -0.07, \"dpitch\": 2.79, \"dyaw\": -0.12, \"droll\": 0.0}, {\"dx\": -2.03, \"dy\": -0.48, \"dz\": -0.09, \"dpitch\": 2.79, \"dyaw\": -0.41, \"droll\": 0.0}, {\"dx\": -2.53, \"dy\": -0.57, \"dz\": -0.2, \"dpitch\": 2.93, \"dyaw\": -0.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.41, "window_alt_abs_m": 1.38, "target_px_mean_hist": 272.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-0.37, 6.28, 20.42, -47.01, -175.8, 0.0]\n  Target bbox: [630.72, 335.62, 649.44, 384.0]\n\nFrame 2:\n  Drone pose: [-0.88, 6.21, 20.39, -47.59, -171.71, 0.0]\n  Target bbox: [580.25, 330.78, 600.38, 371.28]\n\nFrame 3:\n  Drone pose: [-1.38, 6.13, 20.36, -48.89, -181.26, 0.0]\n  Target bbox: [688.18, 305.04, 706.78, 353.23]\n\nFrame 4:\n  Drone pose: [-1.89, 6.07, 20.33, -43.37, -174.6, 0.0]\n  Target bbox: [608.86, 398.62, 628.22, 440.96]\n\nFrame 5 (current):\n  Drone pose: [-2.4, 6.01, 20.3, -48.07, -176.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.79, \"ymin\": 316.67, \"xmax\": 644.15, \"ymax\": 363.9}, \"waypoint_deltas\": [{\"dx\": -0.5, \"dy\": -0.04, \"dz\": -0.03, \"dpitch\": 1.19, \"dyaw\": -0.57, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": -0.07, \"dz\": -0.06, \"dpitch\": 1.22, \"dyaw\": -0.66, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -0.08, \"dz\": -0.08, \"dpitch\": 1.24, \"dyaw\": -0.69, \"droll\": 0.0}, {\"dx\": -2.04, \"dy\": -0.07, \"dz\": -0.11, \"dpitch\": 1.26, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": -2.56, \"dy\": -0.03, \"dz\": -0.13, \"dpitch\": 1.26, \"dyaw\": -0.54, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.91, "window_alt_abs_m": 0.12, "target_px_mean_hist": 294.0, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.52, 6.18, 20.07, -43.83, -168.73, 0.0]\n  Target bbox: [581.14, 312.41, 605.53, 357.36]\n\nFrame 2:\n  Drone pose: [-6.0, 6.11, 20.14, -49.31, -173.22, 0.0]\n  Target bbox: [590.19, 293.24, 618.49, 343.93]\n\nFrame 3:\n  Drone pose: [-6.47, 6.07, 20.03, -51.48, 174.63, 0.0]\n  Target bbox: [679.15, 314.37, 708.66, 365.36]\n\nFrame 4:\n  Drone pose: [-7.07, 6.34, 20.1, -42.38, -177.97, 0.0]\n  Target bbox: [658.02, 414.4, 677.46, 456.2]\n\nFrame 5 (current):\n  Drone pose: [-7.59, 6.6, 20.13, -44.57, -179.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 690.53, \"ymin\": 325.62, \"xmax\": 709.87, \"ymax\": 370.76}, \"waypoint_deltas\": [{\"dx\": -0.58, \"dy\": 0.05, \"dz\": -0.05, \"dpitch\": -2.35, \"dyaw\": 4.87, \"droll\": 0.0}, {\"dx\": -1.13, \"dy\": 0.23, \"dz\": -0.06, \"dpitch\": -2.4, \"dyaw\": 5.44, \"droll\": 0.0}, {\"dx\": -1.7, \"dy\": 0.4, \"dz\": -0.07, \"dpitch\": -2.46, \"dyaw\": 6.02, \"droll\": 0.0}, {\"dx\": -2.27, \"dy\": 0.56, \"dz\": -0.08, \"dpitch\": -2.53, \"dyaw\": 6.56, \"droll\": 0.0}, {\"dx\": -2.85, \"dy\": 0.7, \"dz\": -0.09, \"dpitch\": -2.61, \"dyaw\": 7.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.5, "window_alt_abs_m": 0.26, "target_px_mean_hist": 295.2, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00033/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-11.03, 7.42, 20.2, -41.09, -165.99, 0.0]\n  Target bbox: [586.88, 415.85, 613.03, 462.99]\n\nFrame 2:\n  Drone pose: [-11.62, 7.4, 20.03, -47.44, -172.0, 0.0]\n  Target bbox: [625.06, 333.6, 654.89, 386.03]\n\nFrame 3:\n  Drone pose: [-12.23, 7.37, 20.03, -47.49, -170.45, 0.0]\n  Target bbox: [626.99, 335.33, 652.98, 384.26]\n\nFrame 4:\n  Drone pose: [-12.85, 7.28, 20.02, -47.56, -169.05, 0.0]\n  Target bbox: [627.86, 335.43, 652.24, 384.21]\n\nFrame 5 (current):\n  Drone pose: [-13.57, 7.3, 19.94, -49.85, -172.94, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.17, \"ymin\": 333.75, \"xmax\": 654.86, \"ymax\": 385.8}, \"waypoint_deltas\": [{\"dx\": -0.55, \"dy\": -0.26, \"dz\": 0.08, \"dpitch\": 2.15, \"dyaw\": 6.49, \"droll\": 0.0}, {\"dx\": -1.19, \"dy\": -0.38, \"dz\": 0.07, \"dpitch\": 2.08, \"dyaw\": 7.8, \"droll\": 0.0}, {\"dx\": -1.83, \"dy\": -0.49, \"dz\": 0.07, \"dpitch\": 2.03, \"dyaw\": 9.18, \"droll\": 0.0}, {\"dx\": -2.47, \"dy\": -0.58, \"dz\": 0.07, \"dpitch\": 2.01, \"dyaw\": 10.62, \"droll\": 0.0}, {\"dx\": -3.1, \"dy\": -0.66, \"dz\": 0.07, \"dpitch\": 1.76, \"dyaw\": 10.48, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.85, "window_alt_abs_m": 0.26, "target_px_mean_hist": 290.8, "cur_frame_id": 33, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00043/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-17.31, 6.54, 20.01, -48.36, -162.63, 0.0]\n  Target bbox: [626.22, 334.37, 653.7, 385.18]\n\nFrame 2:\n  Drone pose: [-17.96, 6.42, 20.0, -45.78, -162.25, 0.0]\n  Target bbox: [635.59, 377.7, 667.03, 429.78]\n\nFrame 3:\n  Drone pose: [-18.51, 6.42, 20.02, -50.13, -170.41, 0.0]\n  Target bbox: [681.14, 281.39, 704.01, 329.29]\n\nFrame 4:\n  Drone pose: [-19.27, 6.11, 20.0, -46.74, -159.11, 0.0]\n  Target bbox: [630.57, 363.55, 658.21, 414.39]\n\nFrame 5 (current):\n  Drone pose: [-19.92, 5.91, 20.0, -48.52, -157.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.57, \"ymin\": 334.63, \"xmax\": 653.33, \"ymax\": 384.9}, \"waypoint_deltas\": [{\"dx\": -0.65, \"dy\": -0.23, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.48, \"dz\": 0.0, \"dpitch\": -0.12, \"dyaw\": 2.01, \"droll\": 0.0}, {\"dx\": -1.94, \"dy\": -0.75, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 2.94, \"droll\": 0.0}, {\"dx\": -2.59, \"dy\": -1.03, \"dz\": 0.0, \"dpitch\": -0.41, \"dyaw\": 3.84, \"droll\": 0.0}, {\"dx\": -3.22, \"dy\": -1.31, \"dz\": 0.0, \"dpitch\": -0.5, \"dyaw\": 4.69, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.35, "window_alt_abs_m": 0.03, "target_px_mean_hist": 298.0, "cur_frame_id": 43, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00051/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00052/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00053/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-23.75, 4.32, 20.0, -45.12, -150.17, 0.0]\n  Target bbox: [606.98, 401.47, 630.33, 449.31]\n\nFrame 2:\n  Drone pose: [-24.33, 4.03, 20.0, -48.95, -151.31, 0.0]\n  Target bbox: [627.66, 336.15, 652.22, 383.41]\n\nFrame 3:\n  Drone pose: [-24.89, 3.76, 20.0, -46.07, -154.44, 0.0]\n  Target bbox: [671.67, 385.65, 694.92, 429.89]\n\nFrame 4:\n  Drone pose: [-25.41, 3.41, 20.0, -48.76, -150.05, 0.0]\n  Target bbox: [627.49, 334.28, 652.67, 385.32]\n\nFrame 5 (current):\n  Drone pose: [-25.71, 3.11, 19.96, -44.02, -147.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 686.21, \"ymin\": 408.38, \"xmax\": 706.27, \"ymax\": 451.65}, \"waypoint_deltas\": [{\"dx\": -0.59, \"dy\": -0.35, \"dz\": 0.04, \"dpitch\": -5.01, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": -1.04, \"dy\": -0.69, \"dz\": 0.04, \"dpitch\": -5.51, \"dyaw\": 0.61, \"droll\": 0.0}, {\"dx\": -1.52, \"dy\": -1.03, \"dz\": 0.04, \"dpitch\": -5.33, \"dyaw\": 1.01, \"droll\": 0.0}, {\"dx\": -2.02, \"dy\": -1.39, \"dz\": 0.04, \"dpitch\": -5.2, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": -2.54, \"dy\": -1.75, \"dz\": 0.04, \"dpitch\": -5.1, \"dyaw\": 1.87, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.04, "window_alt_abs_m": 0.04, "target_px_mean_hist": 296.5, "cur_frame_id": 53, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00061/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00062/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-28.25, 1.36, 20.0, -48.27, -150.4, 0.0]\n  Target bbox: [677.4, 351.84, 703.88, 399.23]\n\nFrame 2:\n  Drone pose: [-28.79, 0.99, 20.0, -48.31, -142.87, 0.0]\n  Target bbox: [599.94, 348.16, 624.84, 396.98]\n\nFrame 3:\n  Drone pose: [-29.35, 0.61, 20.0, -49.0, -144.9, 0.0]\n  Target bbox: [628.08, 335.92, 651.79, 383.59]\n\nFrame 4:\n  Drone pose: [-29.91, 0.23, 20.0, -44.81, -143.91, 0.0]\n  Target bbox: [621.37, 406.2, 646.45, 453.27]\n\nFrame 5 (current):\n  Drone pose: [-30.48, -0.15, 20.0, -48.96, -143.99, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.53, \"ymin\": 337.45, \"xmax\": 651.36, \"ymax\": 382.07}, \"waypoint_deltas\": [{\"dx\": -0.57, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": -1.15, \"dy\": -0.77, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.92, \"droll\": 0.0}, {\"dx\": -1.73, \"dy\": -1.16, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 1.39, \"droll\": 0.0}, {\"dx\": -2.31, \"dy\": -1.54, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": -2.91, \"dy\": -1.92, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": 2.41, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.64, "window_alt_abs_m": 0.0, "target_px_mean_hist": 309.8, "cur_frame_id": 62, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00072/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-33.99, -2.44, 20.0, -50.14, -136.04, 0.0]\n  Target bbox: [574.1, 320.37, 595.23, 361.28]\n\nFrame 2:\n  Drone pose: [-34.6, -2.81, 20.0, -48.91, -140.46, 0.0]\n  Target bbox: [629.79, 338.46, 650.11, 381.08]\n\nFrame 3:\n  Drone pose: [-35.21, -3.18, 20.0, -49.54, -144.84, 0.0]\n  Target bbox: [683.0, 327.53, 707.47, 374.65]\n\nFrame 4:\n  Drone pose: [-35.84, -3.54, 20.0, -48.93, -139.19, 0.0]\n  Target bbox: [629.03, 337.76, 650.88, 381.79]\n\nFrame 5 (current):\n  Drone pose: [-36.47, -3.92, 19.98, -49.05, -145.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.94, \"ymin\": 334.09, \"xmax\": 654.16, \"ymax\": 385.54}, \"waypoint_deltas\": [{\"dx\": -0.66, \"dy\": -0.32, \"dz\": 0.02, \"dpitch\": 0.09, \"dyaw\": 7.89, \"droll\": 0.0}, {\"dx\": -1.3, \"dy\": -0.67, \"dz\": 0.02, \"dpitch\": 0.08, \"dyaw\": 8.61, \"droll\": 0.0}, {\"dx\": -1.95, \"dy\": -1.02, \"dz\": 0.02, \"dpitch\": 0.08, \"dyaw\": 9.31, \"droll\": 0.0}, {\"dx\": -2.58, \"dy\": -1.37, \"dz\": 0.02, \"dpitch\": 0.09, \"dyaw\": 9.98, \"droll\": 0.0}, {\"dx\": -3.18, \"dy\": -1.73, \"dz\": 0.02, \"dpitch\": 0.14, \"dyaw\": 10.57, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.96, "window_alt_abs_m": 0.02, "target_px_mean_hist": 301.8, "cur_frame_id": 72, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00082/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-40.21, -6.01, 20.0, -51.94, -139.67, 0.0]\n  Target bbox: [684.65, 285.87, 706.15, 332.34]\n\nFrame 2:\n  Drone pose: [-40.7, -6.38, 20.0, -48.64, -134.38, 0.0]\n  Target bbox: [630.96, 338.75, 649.14, 380.76]\n\nFrame 3:\n  Drone pose: [-41.1, -6.75, 20.0, -47.83, -135.21, 0.0]\n  Target bbox: [635.23, 343.38, 665.58, 395.1]\n\nFrame 4:\n  Drone pose: [-41.43, -7.12, 20.0, -48.6, -133.21, 0.0]\n  Target bbox: [624.72, 333.61, 655.32, 386.0]\n\nFrame 5 (current):\n  Drone pose: [-41.68, -7.5, 20.0, -45.2, -127.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.04, \"ymin\": 397.53, \"xmax\": 596.74, \"ymax\": 444.34}, \"waypoint_deltas\": [{\"dx\": -0.19, \"dy\": -0.39, \"dz\": 0.0, \"dpitch\": -3.61, \"dyaw\": -4.26, \"droll\": 0.0}, {\"dx\": -0.34, \"dy\": -0.79, \"dz\": 0.0, \"dpitch\": -3.65, \"dyaw\": -3.67, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": -1.2, \"dz\": 0.0, \"dpitch\": -3.67, \"dyaw\": -3.17, \"droll\": 0.0}, {\"dx\": -0.57, \"dy\": -1.63, \"dz\": 0.0, \"dpitch\": -3.7, \"dyaw\": -2.71, \"droll\": 0.0}, {\"dx\": -0.71, \"dy\": -2.07, \"dz\": 0.0, \"dpitch\": -3.77, \"dyaw\": -2.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.02, "window_alt_abs_m": 0.0, "target_px_mean_hist": 311.5, "cur_frame_id": 82, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00089/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00090/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00091/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833/aug_001/frames_playback/frame_00092/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.64, -10.07, 19.94, -47.02, -138.54, 0.0]\n  Target bbox: [654.17, 387.64, 680.78, 436.6]\n\nFrame 2:\n  Drone pose: [-42.59, -10.47, 19.95, -50.05, -129.19, 0.0]\n  Target bbox: [617.17, 375.07, 644.46, 427.0]\n\nFrame 3:\n  Drone pose: [-43.02, -10.96, 19.81, -48.33, -135.21, 0.0]\n  Target bbox: [627.44, 336.35, 652.44, 383.16]\n\nFrame 4:\n  Drone pose: [-43.36, -11.56, 19.9, -47.27, -134.99, 0.0]\n  Target bbox: [627.27, 335.5, 652.58, 384.09]\n\nFrame 5 (current):\n  Drone pose: [-43.72, -12.07, 20.0, -48.79, -129.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.53, \"ymin\": 337.65, \"xmax\": 650.58, \"ymax\": 381.85}, \"waypoint_deltas\": [{\"dx\": -0.39, \"dy\": -0.55, \"dz\": 0.0, \"dpitch\": 0.06, \"dyaw\": -0.42, \"droll\": 0.0}, {\"dx\": -0.78, \"dy\": -1.11, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": -0.82, \"droll\": 0.0}, {\"dx\": -1.18, \"dy\": -1.68, \"dz\": 0.0, \"dpitch\": 0.12, \"dyaw\": -1.23, \"droll\": 0.0}, {\"dx\": -1.58, \"dy\": -2.25, \"dz\": 0.0, \"dpitch\": 0.36, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": -1.98, \"dy\": -2.82, \"dz\": 0.0, \"dpitch\": 0.39, \"dyaw\": -2.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 20.62, "window_alt_abs_m": 0.33, "target_px_mean_hist": 322.5, "cur_frame_id": 92, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-13/trajectory_1776057833", "difficulty_score": 0.4353, "seen_in_selected_250k": true, "seen_group": "seen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.39, -20.56, 22.0, -46.85, 90.0, 0.0]\n  Target bbox: [631.85, 346.83, 648.15, 387.58]\n\nFrame 2:\n  Drone pose: [104.36, -20.96, 21.2, -44.46, 87.19, 0.0]\n  Target bbox: [630.13, 346.39, 649.74, 388.86]\n\nFrame 3:\n  Drone pose: [103.93, -20.83, 20.67, -43.18, 86.08, 0.0]\n  Target bbox: [630.39, 346.38, 649.49, 389.15]\n\nFrame 4:\n  Drone pose: [103.8, -20.43, 20.64, -42.99, 85.75, 0.0]\n  Target bbox: [625.45, 346.0, 654.34, 388.09]\n\nFrame 5 (current):\n  Drone pose: [103.79, -19.93, 20.62, -42.95, 85.71, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.84, \"ymin\": 346.62, \"xmax\": 652.01, \"ymax\": 388.7}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.03, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.54, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.05, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.56, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 172.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.79, -14.81, 20.27, -42.61, 85.68, 0.0]\n  Target bbox: [625.22, 346.24, 654.58, 388.28]\n\nFrame 2:\n  Drone pose: [103.79, -14.3, 20.24, -42.58, 85.68, 0.0]\n  Target bbox: [625.49, 345.72, 654.31, 389.53]\n\nFrame 3:\n  Drone pose: [103.79, -13.79, 20.22, -42.56, 85.68, 0.0]\n  Target bbox: [627.88, 345.84, 651.95, 389.91]\n\nFrame 4:\n  Drone pose: [103.79, -13.28, 20.19, -42.53, 85.68, 0.0]\n  Target bbox: [630.04, 345.87, 649.82, 390.38]\n\nFrame 5 (current):\n  Drone pose: [103.79, -12.78, 20.17, -42.51, 85.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 631.23, \"ymin\": 346.3, \"xmax\": 648.65, \"ymax\": 389.53}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.02, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.53, \"dz\": -0.05, \"dpitch\": 0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.04, \"dz\": -0.07, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.54, \"dz\": -0.08, \"dpitch\": 0.07, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.1, "target_px_mean_hist": 175.5, "cur_frame_id": 18, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.79, -7.71, 20.04, -42.42, 85.66, 0.0]\n  Target bbox: [629.61, 346.49, 650.25, 388.71]\n\nFrame 2:\n  Drone pose: [103.79, -7.2, 20.04, -42.41, 85.65, 0.0]\n  Target bbox: [627.53, 346.2, 652.31, 389.71]\n\nFrame 3:\n  Drone pose: [103.79, -6.7, 20.03, -42.41, 85.65, 0.0]\n  Target bbox: [627.64, 345.72, 652.19, 390.15]\n\nFrame 4:\n  Drone pose: [103.79, -6.19, 20.03, -42.41, 85.65, 0.0]\n  Target bbox: [629.62, 345.79, 650.24, 390.58]\n\nFrame 5 (current):\n  Drone pose: [103.79, -5.69, 20.02, -42.41, 85.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.42, \"ymin\": 346.44, \"xmax\": 649.45, \"ymax\": 388.93}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 2.02, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 2.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.02, "target_px_mean_hist": 173.0, "cur_frame_id": 32, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, -0.65, 20.0, -42.44, 85.64, 0.0]\n  Target bbox: [629.99, 345.76, 649.87, 390.65]\n\nFrame 2:\n  Drone pose: [103.78, -0.15, 20.0, -42.44, 85.64, 0.0]\n  Target bbox: [628.69, 345.81, 651.16, 390.49]\n\nFrame 3:\n  Drone pose: [103.78, 0.36, 20.0, -42.44, 85.64, 0.0]\n  Target bbox: [629.17, 345.78, 650.68, 390.58]\n\nFrame 4:\n  Drone pose: [103.78, 0.86, 20.0, -42.45, 85.64, 0.0]\n  Target bbox: [628.98, 346.79, 650.88, 388.21]\n\nFrame 5 (current):\n  Drone pose: [103.78, 1.36, 20.0, -42.45, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.56, \"ymin\": 346.32, \"xmax\": 652.28, \"ymax\": 389.48}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 176.2, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00060/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, 6.39, 20.0, -42.48, 85.63, 0.0]\n  Target bbox: [627.42, 346.12, 652.42, 389.83]\n\nFrame 2:\n  Drone pose: [103.78, 6.89, 20.0, -42.48, 85.63, 0.0]\n  Target bbox: [629.43, 346.85, 650.44, 388.02]\n\nFrame 3:\n  Drone pose: [103.78, 7.39, 20.0, -42.48, 85.63, 0.0]\n  Target bbox: [629.1, 345.77, 650.75, 390.58]\n\nFrame 4:\n  Drone pose: [103.78, 7.89, 20.0, -42.48, 85.63, 0.0]\n  Target bbox: [629.89, 345.75, 649.97, 390.64]\n\nFrame 5 (current):\n  Drone pose: [103.78, 8.39, 20.0, -42.48, 85.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.15, \"ymin\": 345.67, \"xmax\": 654.63, \"ymax\": 388.87}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 172.8, "cur_frame_id": 60, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, 13.4, 20.0, -42.5, 85.63, 0.0]\n  Target bbox: [625.0, 345.68, 654.78, 388.87]\n\nFrame 2:\n  Drone pose: [103.78, 13.9, 20.0, -42.5, 85.63, 0.0]\n  Target bbox: [629.95, 345.73, 649.91, 390.68]\n\nFrame 3:\n  Drone pose: [103.79, 14.41, 20.0, -42.5, 85.63, 0.0]\n  Target bbox: [629.89, 346.86, 649.98, 387.92]\n\nFrame 4:\n  Drone pose: [103.79, 14.91, 20.0, -42.51, 85.64, 0.0]\n  Target bbox: [627.05, 346.02, 652.78, 389.92]\n\nFrame 5 (current):\n  Drone pose: [103.79, 15.41, 20.0, -42.51, 85.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 630.83, \"ymin\": 346.39, \"xmax\": 649.04, \"ymax\": 389.07}, \"waypoint_deltas\": [{\"dx\": 0.01, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 0.03, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 0.05, \"dy\": 1.5, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 2.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.21, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.03, "window_alt_abs_m": 0.0, "target_px_mean_hist": 176.5, "cur_frame_id": 74, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.42, 20.47, 20.0, -42.65, 87.35, 0.0]\n  Target bbox: [626.22, 346.0, 653.59, 389.76]\n\nFrame 2:\n  Drone pose: [104.59, 21.0, 20.0, -42.69, 87.81, 0.0]\n  Target bbox: [628.16, 345.97, 651.67, 390.08]\n\nFrame 3:\n  Drone pose: [104.78, 21.53, 20.0, -42.74, 88.33, 0.0]\n  Target bbox: [626.2, 346.58, 653.6, 388.25]\n\nFrame 4:\n  Drone pose: [105.0, 22.07, 20.0, -42.8, 88.92, 0.0]\n  Target bbox: [626.57, 346.25, 653.24, 389.58]\n\nFrame 5 (current):\n  Drone pose: [105.24, 22.62, 20.0, -42.87, 89.57, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.84, \"ymin\": 346.23, \"xmax\": 654.0, \"ymax\": 388.94}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.7, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 2.17, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 2.28, \"dz\": 0.0, \"dpitch\": -0.37, \"dyaw\": 2.88, \"droll\": 0.0}, {\"dx\": 1.25, \"dy\": 2.88, \"dz\": 0.0, \"dpitch\": -0.49, \"dyaw\": 3.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.22, "window_alt_abs_m": 0.0, "target_px_mean_hist": 181.5, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00102/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.68, 28.59, 20.0, -43.96, 98.0, 0.0]\n  Target bbox: [627.49, 344.12, 652.4, 390.92]\n\nFrame 2:\n  Drone pose: [106.56, 29.22, 20.0, -44.06, 99.13, 0.0]\n  Target bbox: [627.63, 344.09, 652.26, 390.92]\n\nFrame 3:\n  Drone pose: [106.41, 29.85, 20.0, -44.16, 100.17, 0.0]\n  Target bbox: [625.57, 342.46, 654.32, 393.56]\n\nFrame 4:\n  Drone pose: [106.23, 30.47, 20.0, -44.26, 101.15, 0.0]\n  Target bbox: [628.32, 344.38, 651.57, 390.46]\n\nFrame 5 (current):\n  Drone pose: [106.03, 31.1, 20.0, -44.35, 102.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.04, \"ymin\": 342.82, \"xmax\": 654.91, \"ymax\": 393.46}, \"waypoint_deltas\": [{\"dx\": -0.22, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": 0.87, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 1.22, \"dz\": 0.0, \"dpitch\": -0.14, \"dyaw\": 1.68, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": 2.42, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 2.41, \"dz\": 0.0, \"dpitch\": -0.24, \"dyaw\": 3.1, \"droll\": 0.0}, {\"dx\": -1.32, \"dy\": 2.98, \"dz\": 0.0, \"dpitch\": -0.27, \"dyaw\": 3.7, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.07, "window_alt_abs_m": 0.0, "target_px_mean_hist": 178.5, "cur_frame_id": 102, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00115/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00116/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.77, 36.74, 20.0, -44.6, 107.44, 0.0]\n  Target bbox: [625.94, 343.85, 654.01, 392.57]\n\nFrame 2:\n  Drone pose: [102.31, 37.23, 20.0, -44.57, 107.54, 0.0]\n  Target bbox: [625.84, 343.8, 654.1, 392.46]\n\nFrame 3:\n  Drone pose: [101.83, 37.7, 20.0, -44.53, 107.58, 0.0]\n  Target bbox: [626.25, 342.4, 653.6, 393.64]\n\nFrame 4:\n  Drone pose: [101.35, 38.17, 20.0, -44.47, 107.62, 0.0]\n  Target bbox: [628.42, 344.3, 651.46, 390.44]\n\nFrame 5 (current):\n  Drone pose: [100.9, 38.63, 20.0, -44.4, 107.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.46, \"ymin\": 342.27, \"xmax\": 655.41, \"ymax\": 393.48}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.45, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 1.33, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -1.44, \"dy\": 1.76, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -1.41, \"droll\": 0.0}, {\"dx\": -1.7, \"dy\": 2.18, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -2.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.26, "window_alt_abs_m": 0.0, "target_px_mean_hist": 188.2, "cur_frame_id": 116, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/ORI/frames_playback/frame_00130/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.07, 42.96, 20.0, -44.15, 102.1, 0.0]\n  Target bbox: [626.25, 345.07, 653.89, 389.96]\n\nFrame 2:\n  Drone pose: [97.86, 43.4, 20.0, -44.12, 101.46, 0.0]\n  Target bbox: [628.92, 344.94, 651.21, 391.13]\n\nFrame 3:\n  Drone pose: [97.65, 43.84, 20.0, -44.08, 100.84, 0.0]\n  Target bbox: [629.11, 345.36, 651.02, 390.03]\n\nFrame 4:\n  Drone pose: [97.44, 44.27, 20.0, -44.05, 100.24, 0.0]\n  Target bbox: [628.4, 345.4, 651.74, 389.84]\n\nFrame 5 (current):\n  Drone pose: [97.25, 44.72, 20.0, -44.02, 99.66, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.94, \"ymin\": 345.07, \"xmax\": 651.2, \"ymax\": 391.03}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 0.07, \"dyaw\": -1.06, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": 1.34, \"dz\": 0.0, \"dpitch\": 0.11, \"dyaw\": -1.54, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": 1.79, \"dz\": 0.0, \"dpitch\": 0.14, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 2.25, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -2.4, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.44, "window_alt_abs_m": 0.0, "target_px_mean_hist": 185.0, "cur_frame_id": 130, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [105.4, -20.58, 22.06, -41.59, 91.9, 0.0]\n  Target bbox: [661.39, 368.96, 688.74, 414.89]\n\nFrame 2:\n  Drone pose: [104.36, -20.96, 21.2, -44.46, 87.19, 0.0]\n  Target bbox: [628.82, 347.13, 651.04, 387.17]\n\nFrame 3:\n  Drone pose: [103.93, -20.83, 20.67, -43.18, 86.08, 0.0]\n  Target bbox: [630.74, 346.22, 649.4, 389.45]\n\nFrame 4:\n  Drone pose: [103.84, -20.4, 20.65, -40.31, 85.59, 0.0]\n  Target bbox: [628.85, 346.39, 651.16, 388.56]\n\nFrame 5 (current):\n  Drone pose: [103.83, -19.87, 20.71, -38.97, 81.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.83, \"ymin\": 345.76, \"xmax\": 653.2, \"ymax\": 389.53}, \"waypoint_deltas\": [{\"dx\": -0.04, \"dy\": 0.45, \"dz\": -0.12, \"dpitch\": -3.96, \"dyaw\": 4.34, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 0.97, \"dz\": -0.14, \"dpitch\": -3.95, \"dyaw\": 4.34, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.48, \"dz\": -0.16, \"dpitch\": -3.93, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 1.99, \"dz\": -0.18, \"dpitch\": -3.92, \"dyaw\": 4.33, \"droll\": 0.0}, {\"dx\": -0.04, \"dy\": 2.5, \"dz\": -0.29, \"dpitch\": -3.78, \"dyaw\": 4.33, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.53, "window_alt_abs_m": 1.48, "target_px_mean_hist": 162.5, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00017/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00018/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.79, -14.81, 20.27, -43.23, 90.68, 0.0]\n  Target bbox: [569.34, 338.11, 587.81, 380.61]\n\nFrame 2:\n  Drone pose: [103.79, -14.3, 20.24, -47.57, 86.31, 0.0]\n  Target bbox: [618.22, 262.15, 645.87, 305.91]\n\nFrame 3:\n  Drone pose: [103.79, -13.79, 20.22, -43.17, 82.18, 0.0]\n  Target bbox: [671.47, 337.2, 694.42, 378.98]\n\nFrame 4:\n  Drone pose: [103.79, -13.28, 20.19, -42.53, 85.68, 0.0]\n  Target bbox: [625.65, 345.18, 654.42, 390.12]\n\nFrame 5 (current):\n  Drone pose: [103.73, -12.74, 20.11, -40.87, 83.78, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 572.89, \"ymin\": 396.43, \"xmax\": 608.03, \"ymax\": 448.76}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.47, \"dz\": 0.04, \"dpitch\": -1.63, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 0.98, \"dz\": 0.02, \"dpitch\": -1.61, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 1.49, \"dz\": 0.01, \"dpitch\": -1.6, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 2.0, \"dz\": -0.01, \"dpitch\": -1.59, \"dyaw\": 1.89, \"droll\": 0.0}, {\"dx\": 0.06, \"dy\": 2.5, \"dz\": -0.02, \"dpitch\": -1.57, \"dyaw\": 1.88, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.89, "window_alt_abs_m": 0.17, "target_px_mean_hist": 178.2, "cur_frame_id": 18, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.83, -7.84, 20.11, -36.12, 79.21, 0.0]\n  Target bbox: [690.11, 386.22, 720.78, 432.1]\n\nFrame 2:\n  Drone pose: [103.77, -7.05, 20.04, -41.77, 81.02, 0.0]\n  Target bbox: [625.06, 345.66, 654.73, 388.85]\n\nFrame 3:\n  Drone pose: [103.79, -6.7, 20.03, -38.03, 81.96, 0.0]\n  Target bbox: [671.74, 419.25, 699.6, 465.42]\n\nFrame 4:\n  Drone pose: [103.73, -6.02, 20.07, -39.08, 72.99, 0.0]\n  Target bbox: [687.14, 384.1, 718.25, 431.37]\n\nFrame 5 (current):\n  Drone pose: [103.79, -5.69, 20.02, -42.41, 85.65, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 625.93, \"ymin\": 346.12, \"xmax\": 653.87, \"ymax\": 388.69}, \"waypoint_deltas\": [{\"dx\": -0.01, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 1.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 2.02, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": -0.01, \"dy\": 2.52, \"dz\": -0.01, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 24.37, "window_alt_abs_m": 0.17, "target_px_mean_hist": 164.2, "cur_frame_id": 32, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, -0.65, 20.0, -44.68, 90.64, 0.0]\n  Target bbox: [561.26, 306.92, 595.38, 357.06]\n\nFrame 2:\n  Drone pose: [103.82, -0.15, 19.98, -43.93, 91.25, 0.0]\n  Target bbox: [593.96, 367.85, 620.83, 415.19]\n\nFrame 3:\n  Drone pose: [103.72, 0.32, 19.89, -39.98, 88.17, 0.0]\n  Target bbox: [560.66, 388.11, 596.19, 439.0]\n\nFrame 4:\n  Drone pose: [103.78, 0.86, 20.0, -44.44, 90.64, 0.0]\n  Target bbox: [565.41, 314.97, 591.21, 357.73]\n\nFrame 5 (current):\n  Drone pose: [103.78, 1.36, 20.0, -42.45, 85.64, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 626.3, \"ymin\": 345.56, \"xmax\": 653.51, \"ymax\": 390.14}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.51, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.52, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.16, "window_alt_abs_m": 0.23, "target_px_mean_hist": 189.8, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00060/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, 6.39, 20.0, -40.24, 86.75, 0.0]\n  Target bbox: [611.69, 381.92, 640.22, 429.52]\n\nFrame 2:\n  Drone pose: [103.79, 6.79, 20.0, -45.71, 89.42, 0.0]\n  Target bbox: [577.53, 310.28, 600.92, 353.28]\n\nFrame 3:\n  Drone pose: [103.78, 7.39, 20.0, -42.48, 85.63, 0.0]\n  Target bbox: [627.45, 346.13, 652.39, 389.81]\n\nFrame 4:\n  Drone pose: [103.78, 7.89, 20.0, -41.39, 90.63, 0.0]\n  Target bbox: [566.54, 367.22, 590.15, 407.61]\n\nFrame 5 (current):\n  Drone pose: [103.78, 8.39, 20.0, -41.9, 80.63, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 689.41, \"ymin\": 356.91, \"xmax\": 713.5, \"ymax\": 402.84}, \"waypoint_deltas\": [{\"dx\": 0.0, \"dy\": 0.5, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.0, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 1.51, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.01, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": 5.0, \"droll\": 0.0}, {\"dx\": 0.0, \"dy\": 2.51, \"dz\": 0.0, \"dpitch\": -0.59, \"dyaw\": 5.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 21.46, "window_alt_abs_m": 0.01, "target_px_mean_hist": 149.5, "cur_frame_id": 60, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00071/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00072/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00073/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00074/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [103.78, 13.4, 20.0, -42.5, 85.63, 0.0]\n  Target bbox: [629.81, 345.77, 650.05, 390.59]\n\nFrame 2:\n  Drone pose: [103.78, 13.9, 20.0, -42.5, 85.63, 0.0]\n  Target bbox: [625.64, 345.68, 654.15, 389.58]\n\nFrame 3:\n  Drone pose: [103.79, 14.41, 20.0, -43.31, 80.63, 0.0]\n  Target bbox: [684.96, 333.24, 718.03, 377.9]\n\nFrame 4:\n  Drone pose: [103.79, 14.91, 20.0, -37.6, 87.33, 0.0]\n  Target bbox: [607.44, 427.85, 630.76, 473.06]\n\nFrame 5 (current):\n  Drone pose: [103.74, 15.47, 19.95, -39.79, 92.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.8, \"ymin\": 346.13, \"xmax\": 650.27, \"ymax\": 389.51}, \"waypoint_deltas\": [{\"dx\": 0.06, \"dy\": 0.44, \"dz\": 0.05, \"dpitch\": -2.72, \"dyaw\": -6.99, \"droll\": 0.0}, {\"dx\": 0.08, \"dy\": 0.94, \"dz\": 0.05, \"dpitch\": -2.72, \"dyaw\": -6.95, \"droll\": 0.0}, {\"dx\": 0.1, \"dy\": 1.44, \"dz\": 0.05, \"dpitch\": -2.73, \"dyaw\": -6.89, \"droll\": 0.0}, {\"dx\": 0.13, \"dy\": 1.94, \"dz\": 0.05, \"dpitch\": -2.73, \"dyaw\": -6.8, \"droll\": 0.0}, {\"dx\": 0.18, \"dy\": 2.45, \"dz\": 0.05, \"dpitch\": -2.74, \"dyaw\": -6.68, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.03, "window_alt_abs_m": 0.05, "target_px_mean_hist": 183.8, "cur_frame_id": 74, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [104.42, 20.47, 20.0, -39.09, 82.35, 0.0]\n  Target bbox: [689.15, 406.73, 713.7, 452.56]\n\nFrame 2:\n  Drone pose: [104.59, 21.0, 20.0, -41.01, 90.35, 0.0]\n  Target bbox: [598.69, 374.96, 618.7, 418.49]\n\nFrame 3:\n  Drone pose: [104.78, 21.53, 20.0, -42.74, 88.33, 0.0]\n  Target bbox: [626.57, 344.41, 653.4, 391.65]\n\nFrame 4:\n  Drone pose: [105.0, 22.15, 20.17, -43.31, 94.62, 0.0]\n  Target bbox: [623.35, 272.24, 649.27, 318.73]\n\nFrame 5 (current):\n  Drone pose: [105.24, 22.62, 20.0, -44.16, 91.72, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 600.16, \"ymin\": 324.62, \"xmax\": 627.12, \"ymax\": 368.7}, \"waypoint_deltas\": [{\"dx\": 0.25, \"dy\": 0.55, \"dz\": 0.0, \"dpitch\": 1.21, \"dyaw\": -1.45, \"droll\": 0.0}, {\"dx\": 0.51, \"dy\": 1.12, \"dz\": 0.0, \"dpitch\": 1.12, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 0.78, \"dy\": 1.7, \"dz\": 0.0, \"dpitch\": 1.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 2.28, \"dz\": 0.0, \"dpitch\": 0.92, \"dyaw\": 0.73, \"droll\": 0.0}, {\"dx\": 1.25, \"dy\": 2.88, \"dz\": 0.0, \"dpitch\": 0.8, \"dyaw\": 1.34, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 19.2, "window_alt_abs_m": 0.33, "target_px_mean_hist": 173.0, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00099/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00100/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00101/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00102/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [106.73, 28.76, 19.91, -46.36, 99.72, 0.0]\n  Target bbox: [665.11, 333.86, 694.15, 380.31]\n\nFrame 2:\n  Drone pose: [106.56, 29.22, 20.0, -42.5, 94.13, 0.0]\n  Target bbox: [683.6, 370.21, 716.41, 421.21]\n\nFrame 3:\n  Drone pose: [106.41, 29.85, 20.0, -45.58, 100.3, 0.0]\n  Target bbox: [624.5, 319.94, 652.48, 369.06]\n\nFrame 4:\n  Drone pose: [106.23, 30.47, 20.0, -44.26, 101.15, 0.0]\n  Target bbox: [623.89, 341.89, 656.02, 393.97]\n\nFrame 5 (current):\n  Drone pose: [106.03, 31.1, 20.0, -47.26, 100.44, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 647.41, \"ymin\": 295.47, \"xmax\": 671.57, \"ymax\": 342.2}, \"waypoint_deltas\": [{\"dx\": -0.22, \"dy\": 0.62, \"dz\": 0.0, \"dpitch\": 2.83, \"dyaw\": 2.49, \"droll\": 0.0}, {\"dx\": -0.46, \"dy\": 1.22, \"dz\": 0.0, \"dpitch\": 2.77, \"dyaw\": 3.3, \"droll\": 0.0}, {\"dx\": -0.72, \"dy\": 1.82, \"dz\": 0.0, \"dpitch\": 2.71, \"dyaw\": 4.04, \"droll\": 0.0}, {\"dx\": -1.01, \"dy\": 2.41, \"dz\": 0.0, \"dpitch\": 2.67, \"dyaw\": 4.72, \"droll\": 0.0}, {\"dx\": -1.32, \"dy\": 2.98, \"dz\": 0.0, \"dpitch\": 2.64, \"dyaw\": 5.32, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.32, "window_alt_abs_m": 0.09, "target_px_mean_hist": 176.0, "cur_frame_id": 102, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00112/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00113/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00114/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00115/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00116/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [102.77, 36.74, 20.0, -44.6, 107.44, 0.0]\n  Target bbox: [623.8, 341.82, 656.07, 393.85]\n\nFrame 2:\n  Drone pose: [102.31, 37.23, 20.0, -44.57, 107.54, 0.0]\n  Target bbox: [626.24, 342.39, 653.61, 393.65]\n\nFrame 3:\n  Drone pose: [101.83, 37.7, 20.0, -47.93, 109.31, 0.0]\n  Target bbox: [604.34, 286.14, 634.17, 336.45]\n\nFrame 4:\n  Drone pose: [101.35, 38.17, 20.0, -44.47, 107.62, 0.0]\n  Target bbox: [624.51, 342.16, 655.36, 393.74]\n\nFrame 5 (current):\n  Drone pose: [100.9, 38.63, 20.0, -44.4, 107.7, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.92, \"ymin\": 344.49, \"xmax\": 650.97, \"ymax\": 390.22}, \"waypoint_deltas\": [{\"dx\": -0.43, \"dy\": 0.45, \"dz\": 0.0, \"dpitch\": 0.1, \"dyaw\": 0.16, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": 0.24, \"dyaw\": 0.44, \"droll\": 0.0}, {\"dx\": -1.14, \"dy\": 1.33, \"dz\": 0.0, \"dpitch\": 0.18, \"dyaw\": -0.54, \"droll\": 0.0}, {\"dx\": -1.44, \"dy\": 1.76, \"dz\": 0.0, \"dpitch\": 0.16, \"dyaw\": -1.41, \"droll\": 0.0}, {\"dx\": -1.7, \"dy\": 2.18, \"dz\": 0.0, \"dpitch\": 0.15, \"dyaw\": -2.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.63, "window_alt_abs_m": 0.0, "target_px_mean_hist": 189.0, "cur_frame_id": 116, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00126/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00127/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00128/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00129/rgb.png", "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364/aug_001/frames_playback/frame_00130/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [98.07, 42.96, 20.0, -41.14, 104.2, 0.0]\n  Target bbox: [600.71, 394.71, 628.74, 442.78]\n\nFrame 2:\n  Drone pose: [97.86, 43.4, 20.0, -44.77, 102.51, 0.0]\n  Target bbox: [609.17, 330.09, 645.49, 383.28]\n\nFrame 3:\n  Drone pose: [97.66, 43.79, 19.93, -41.03, 95.49, 0.0]\n  Target bbox: [623.16, 342.79, 656.82, 391.93]\n\nFrame 4:\n  Drone pose: [97.44, 44.27, 20.0, -44.05, 100.24, 0.0]\n  Target bbox: [624.07, 344.3, 656.14, 390.27]\n\nFrame 5 (current):\n  Drone pose: [97.25, 44.72, 20.0, -41.42, 102.37, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 593.58, \"ymin\": 388.97, \"xmax\": 621.32, \"ymax\": 434.82}, \"waypoint_deltas\": [{\"dx\": -0.18, \"dy\": 0.44, \"dz\": 0.0, \"dpitch\": -2.56, \"dyaw\": -3.25, \"droll\": 0.0}, {\"dx\": -0.36, \"dy\": 0.89, \"dz\": 0.0, \"dpitch\": -2.53, \"dyaw\": -3.77, \"droll\": 0.0}, {\"dx\": -0.52, \"dy\": 1.34, \"dz\": 0.0, \"dpitch\": -2.49, \"dyaw\": -4.25, \"droll\": 0.0}, {\"dx\": -0.67, \"dy\": 1.79, \"dz\": 0.0, \"dpitch\": -2.46, \"dyaw\": -4.69, \"droll\": 0.0}, {\"dx\": -0.81, \"dy\": 2.25, \"dz\": 0.0, \"dpitch\": -2.42, \"dyaw\": -5.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.6, "window_alt_abs_m": 0.14, "target_px_mean_hist": 169.2, "cur_frame_id": 130, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip193/2026-04-15/trajectory_1776247364", "difficulty_score": 0.4877, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [23.89, 5.44, 22.0, -46.27, 0.0, 0.0]\n  Target bbox: [629.46, 328.97, 650.54, 390.3]\n\nFrame 2:\n  Drone pose: [23.49, 4.42, 21.2, -43.85, 2.81, 0.0]\n  Target bbox: [628.95, 327.32, 651.31, 392.07]\n\nFrame 3:\n  Drone pose: [23.62, 3.99, 20.67, -42.55, 3.92, 0.0]\n  Target bbox: [628.2, 324.0, 652.15, 395.59]\n\nFrame 4:\n  Drone pose: [24.02, 3.86, 20.64, -42.37, 4.25, 0.0]\n  Target bbox: [628.72, 326.64, 651.55, 392.83]\n\nFrame 5 (current):\n  Drone pose: [24.52, 3.84, 20.62, -42.32, 4.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.02, \"ymin\": 330.58, \"xmax\": 651.15, \"ymax\": 388.75}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.0, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.0, \"dz\": -0.09, \"dpitch\": 0.06, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.0, \"dz\": -0.2, \"dpitch\": 0.2, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.29, "window_alt_abs_m": 1.38, "target_px_mean_hist": 483.2, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [27.59, 3.84, 20.39, -42.09, 4.31, 0.0]\n  Target bbox: [628.29, 324.54, 652.05, 395.11]\n\nFrame 2:\n  Drone pose: [28.11, 3.84, 20.36, -42.06, 4.31, 0.0]\n  Target bbox: [628.47, 324.59, 651.87, 395.04]\n\nFrame 3:\n  Drone pose: [28.62, 3.84, 20.33, -42.03, 4.31, 0.0]\n  Target bbox: [627.88, 326.68, 652.4, 392.76]\n\nFrame 4:\n  Drone pose: [29.13, 3.84, 20.3, -42.0, 4.31, 0.0]\n  Target bbox: [628.57, 327.04, 651.7, 392.44]\n\nFrame 5 (current):\n  Drone pose: [29.64, 3.84, 20.27, -41.97, 4.32, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.13, \"ymin\": 326.96, \"xmax\": 652.13, \"ymax\": 392.47}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.05, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.08, \"dpitch\": 0.08, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.0, \"dz\": -0.1, \"dpitch\": 0.1, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.0, \"dz\": -0.12, \"dpitch\": 0.11, \"dyaw\": 0.01, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.12, "target_px_mean_hist": 494.2, "cur_frame_id": 14, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [33.2, 3.84, 20.12, -41.83, 4.33, 0.0]\n  Target bbox: [628.44, 324.99, 651.88, 394.6]\n\nFrame 2:\n  Drone pose: [33.7, 3.84, 20.1, -41.81, 4.33, 0.0]\n  Target bbox: [628.69, 328.27, 651.53, 391.09]\n\nFrame 3:\n  Drone pose: [34.21, 3.84, 20.09, -41.8, 4.34, 0.0]\n  Target bbox: [628.5, 326.11, 651.78, 393.35]\n\nFrame 4:\n  Drone pose: [34.72, 3.84, 20.08, -41.79, 4.34, 0.0]\n  Target bbox: [627.87, 323.31, 652.5, 396.3]\n\nFrame 5 (current):\n  Drone pose: [35.22, 3.84, 20.07, -41.79, 4.34, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.38, \"ymin\": 325.2, \"xmax\": 651.93, \"ymax\": 394.33}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.05, "target_px_mean_hist": 510.5, "cur_frame_id": 25, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [38.23, 3.85, 20.03, -41.74, 4.31, 0.0]\n  Target bbox: [628.74, 329.85, 651.44, 389.48]\n\nFrame 2:\n  Drone pose: [38.72, 3.87, 20.02, -41.72, 4.25, 0.0]\n  Target bbox: [628.87, 329.84, 651.31, 389.5]\n\nFrame 3:\n  Drone pose: [39.21, 3.89, 20.02, -41.71, 4.18, 0.0]\n  Target bbox: [628.32, 325.85, 651.97, 393.62]\n\nFrame 4:\n  Drone pose: [39.71, 3.92, 20.02, -41.7, 4.11, 0.0]\n  Target bbox: [628.39, 325.71, 651.92, 393.83]\n\nFrame 5 (current):\n  Drone pose: [40.21, 3.94, 20.01, -41.7, 4.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.56, \"ymin\": 327.19, \"xmax\": 651.68, \"ymax\": 392.21}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": 0.07, \"dz\": -0.01, \"dpitch\": -0.06, \"dyaw\": -0.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.26, "window_alt_abs_m": 0.01, "target_px_mean_hist": 501.0, "cur_frame_id": 35, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [43.78, 4.01, 20.0, -41.78, 3.87, 0.0]\n  Target bbox: [628.49, 325.35, 651.81, 394.16]\n\nFrame 2:\n  Drone pose: [44.28, 4.01, 20.0, -41.78, 3.87, 0.0]\n  Target bbox: [628.05, 323.28, 652.31, 396.3]\n\nFrame 3:\n  Drone pose: [44.78, 4.01, 20.0, -41.78, 3.87, 0.0]\n  Target bbox: [628.52, 327.16, 651.74, 392.26]\n\nFrame 4:\n  Drone pose: [45.28, 4.01, 20.0, -41.79, 3.88, 0.0]\n  Target bbox: [628.76, 328.8, 651.44, 390.53]\n\nFrame 5 (current):\n  Drone pose: [45.79, 4.01, 20.0, -41.79, 3.88, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.72, \"ymin\": 324.99, \"xmax\": 652.6, \"ymax\": 394.48}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.0, "target_px_mean_hist": 505.0, "cur_frame_id": 46, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [49.3, 4.05, 20.0, -41.81, 3.79, 0.0]\n  Target bbox: [628.55, 326.52, 651.73, 392.97]\n\nFrame 2:\n  Drone pose: [49.81, 4.06, 20.0, -41.82, 3.75, 0.0]\n  Target bbox: [628.12, 323.2, 652.24, 396.38]\n\nFrame 3:\n  Drone pose: [50.31, 4.09, 20.0, -41.82, 3.69, 0.0]\n  Target bbox: [628.75, 326.57, 651.51, 392.85]\n\nFrame 4:\n  Drone pose: [50.81, 4.12, 20.0, -41.83, 3.6, 0.0]\n  Target bbox: [628.56, 326.2, 651.73, 393.3]\n\nFrame 5 (current):\n  Drone pose: [51.32, 4.16, 20.0, -41.84, 3.5, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.37, \"ymin\": 323.43, \"xmax\": 651.99, \"ymax\": 396.18}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.11, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.3, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.18, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.49, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.26, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.71, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": 0.35, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.3, "window_alt_abs_m": 0.0, "target_px_mean_hist": 515.2, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [54.38, 4.62, 20.0, -41.95, 2.26, 0.0]\n  Target bbox: [628.45, 324.52, 651.87, 394.96]\n\nFrame 2:\n  Drone pose: [54.89, 4.73, 20.0, -41.98, 1.95, 0.0]\n  Target bbox: [628.31, 323.68, 652.04, 395.83]\n\nFrame 3:\n  Drone pose: [55.41, 4.85, 20.0, -42.01, 1.62, 0.0]\n  Target bbox: [629.34, 327.62, 650.9, 391.79]\n\nFrame 4:\n  Drone pose: [55.94, 4.98, 20.0, -42.05, 1.27, 0.0]\n  Target bbox: [629.22, 328.45, 651.0, 390.91]\n\nFrame 5 (current):\n  Drone pose: [56.46, 5.11, 20.0, -42.09, 0.92, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 629.03, \"ymin\": 330.17, \"xmax\": 651.15, \"ymax\": 389.07}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": 0.12, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.34, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": 0.23, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.65, \"droll\": 0.0}, {\"dx\": 1.6, \"dy\": 0.32, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -0.88, \"droll\": 0.0}, {\"dx\": 2.13, \"dy\": 0.38, \"dz\": 0.0, \"dpitch\": -0.17, \"dyaw\": -1.04, \"droll\": 0.0}, {\"dx\": 2.67, \"dy\": 0.41, \"dz\": 0.0, \"dpitch\": -0.21, \"dyaw\": -2.5, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.34, "window_alt_abs_m": 0.0, "target_px_mean_hist": 512.0, "cur_frame_id": 67, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [60.21, 5.5, 20.0, -42.39, -2.93, 0.0]\n  Target bbox: [628.57, 329.21, 651.22, 390.11]\n\nFrame 2:\n  Drone pose: [60.75, 5.47, 20.0, -42.45, -2.85, 0.0]\n  Target bbox: [620.48, 324.46, 659.59, 395.08]\n\nFrame 3:\n  Drone pose: [61.29, 5.43, 20.0, -42.47, -4.14, 0.0]\n  Target bbox: [628.19, 325.7, 651.53, 393.72]\n\nFrame 4:\n  Drone pose: [61.84, 5.39, 20.0, -42.53, -4.03, 0.0]\n  Target bbox: [618.18, 322.84, 661.9, 396.79]\n\nFrame 5 (current):\n  Drone pose: [62.38, 5.35, 20.0, -42.54, -5.31, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.34, \"ymin\": 326.39, \"xmax\": 652.39, \"ymax\": 392.98}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": -0.12, \"dz\": 0.0, \"dpitch\": -0.13, \"dyaw\": -1.09, \"droll\": 0.0}, {\"dx\": 2.18, \"dy\": -0.17, \"dz\": 0.0, \"dpitch\": -0.2, \"dyaw\": -0.97, \"droll\": 0.0}, {\"dx\": 2.72, \"dy\": -0.23, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": -2.2, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.75, "window_alt_abs_m": 0.0, "target_px_mean_hist": 505.8, "cur_frame_id": 78, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [65.65, 5.04, 20.0, -42.81, -7.31, 0.0]\n  Target bbox: [623.0, 325.61, 657.11, 393.81]\n\nFrame 2:\n  Drone pose: [66.19, 4.95, 20.0, -42.78, -8.47, 0.0]\n  Target bbox: [627.41, 328.17, 652.4, 391.09]\n\nFrame 3:\n  Drone pose: [66.73, 4.86, 20.0, -42.86, -8.22, 0.0]\n  Target bbox: [625.5, 327.36, 654.61, 392.0]\n\nFrame 4:\n  Drone pose: [67.26, 4.76, 20.0, -42.82, -9.34, 0.0]\n  Target bbox: [625.44, 323.15, 654.2, 396.33]\n\nFrame 5 (current):\n  Drone pose: [67.8, 4.65, 20.0, -42.89, -9.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.55, \"ymin\": 326.29, \"xmax\": 656.5, \"ymax\": 393.05}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.3, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.81, "window_alt_abs_m": 0.0, "target_px_mean_hist": 498.5, "cur_frame_id": 88, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/ORI/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [71.49, 3.83, 20.0, -42.99, -11.04, 0.0]\n  Target bbox: [618.23, 322.01, 661.92, 397.53]\n\nFrame 2:\n  Drone pose: [72.01, 3.64, 20.0, -42.94, -11.93, 0.0]\n  Target bbox: [626.04, 325.62, 653.68, 393.82]\n\nFrame 3:\n  Drone pose: [72.53, 3.44, 20.0, -43.02, -11.39, 0.0]\n  Target bbox: [620.28, 323.57, 659.81, 395.84]\n\nFrame 4:\n  Drone pose: [73.06, 3.24, 20.0, -42.97, -12.21, 0.0]\n  Target bbox: [625.27, 326.98, 654.86, 392.39]\n\nFrame 5 (current):\n  Drone pose: [73.58, 3.05, 20.0, -42.9, -13.07, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.74, \"ymin\": 326.58, \"xmax\": 655.41, \"ymax\": 392.82}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.17, \"dz\": 0.0, \"dpitch\": 0.09, \"dyaw\": -0.93, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": 0.02, \"dyaw\": -0.55, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.22, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": 0.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 3.12, "window_alt_abs_m": 0.0, "target_px_mean_hist": 500.5, "cur_frame_id": 99, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [23.86, 5.3, 22.14, -46.41, 0.41, 0.0]\n  Target bbox: [629.67, 331.16, 650.49, 388.05]\n\nFrame 2:\n  Drone pose: [23.56, 4.55, 21.12, -44.08, 5.75, 0.0]\n  Target bbox: [587.12, 323.1, 612.77, 390.07]\n\nFrame 3:\n  Drone pose: [23.61, 4.08, 20.81, -45.39, 5.72, 0.0]\n  Target bbox: [602.09, 279.75, 627.2, 351.68]\n\nFrame 4:\n  Drone pose: [24.15, 3.86, 20.57, -43.39, 8.36, 0.0]\n  Target bbox: [576.68, 315.13, 601.55, 375.12]\n\nFrame 5 (current):\n  Drone pose: [24.5, 3.86, 20.64, -43.19, 5.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 612.77, \"ymin\": 311.88, \"xmax\": 636.02, \"ymax\": 379.41}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 0.88, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 1.04, \"dy\": -0.02, \"dz\": -0.07, \"dpitch\": 0.9, \"dyaw\": -1.2, \"droll\": 0.0}, {\"dx\": 1.56, \"dy\": -0.02, \"dz\": -0.09, \"dpitch\": 0.91, \"dyaw\": -1.19, \"droll\": 0.0}, {\"dx\": 2.07, \"dy\": -0.02, \"dz\": -0.11, \"dpitch\": 0.93, \"dyaw\": -1.19, \"droll\": 0.0}, {\"dx\": 2.58, \"dy\": -0.02, \"dz\": -0.22, \"dpitch\": 1.07, \"dyaw\": -1.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.9, "window_alt_abs_m": 1.62, "target_px_mean_hist": 472.0, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00014/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [27.59, 3.78, 20.45, -42.18, 4.46, 0.0]\n  Target bbox: [628.86, 327.94, 651.38, 391.5]\n\nFrame 2:\n  Drone pose: [28.24, 3.72, 20.34, -42.2, 4.67, 0.0]\n  Target bbox: [628.67, 328.44, 651.55, 390.91]\n\nFrame 3:\n  Drone pose: [28.64, 3.81, 20.32, -40.09, 1.24, 0.0]\n  Target bbox: [667.55, 358.17, 692.26, 428.55]\n\nFrame 4:\n  Drone pose: [29.29, 3.95, 20.3, -42.22, 4.05, 0.0]\n  Target bbox: [628.32, 324.6, 652.02, 395.02]\n\nFrame 5 (current):\n  Drone pose: [29.74, 3.71, 20.3, -42.15, 4.69, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.8, \"ymin\": 330.1, \"xmax\": 651.38, \"ymax\": 389.23}, \"waypoint_deltas\": [{\"dx\": 0.41, \"dy\": 0.13, \"dz\": -0.06, \"dpitch\": 0.2, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": 0.13, \"dz\": -0.08, \"dpitch\": 0.23, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": 0.13, \"dz\": -0.11, \"dpitch\": 0.26, \"dyaw\": -0.37, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": 0.13, \"dz\": -0.13, \"dpitch\": 0.28, \"dyaw\": -0.36, \"droll\": 0.0}, {\"dx\": 2.44, \"dy\": 0.13, \"dz\": -0.15, \"dpitch\": 0.29, \"dyaw\": -0.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.1, "window_alt_abs_m": 0.16, "target_px_mean_hist": 495.8, "cur_frame_id": 14, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00023/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00024/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00025/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [33.16, 3.73, 20.21, -41.9, 4.62, 0.0]\n  Target bbox: [627.81, 323.44, 652.55, 396.17]\n\nFrame 2:\n  Drone pose: [33.59, 3.98, 20.18, -43.81, 2.78, 0.0]\n  Target bbox: [643.34, 291.95, 666.41, 359.58]\n\nFrame 3:\n  Drone pose: [34.16, 3.73, 20.04, -41.65, 4.63, 0.0]\n  Target bbox: [628.48, 329.99, 651.69, 389.3]\n\nFrame 4:\n  Drone pose: [34.8, 3.75, 19.99, -40.4, 6.94, 0.0]\n  Target bbox: [598.4, 352.81, 622.36, 413.18]\n\nFrame 5 (current):\n  Drone pose: [35.22, 3.84, 20.07, -44.13, 6.97, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 594.74, \"ymin\": 288.7, \"xmax\": 619.18, \"ymax\": 353.18}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.01, \"dpitch\": 2.35, \"dyaw\": -2.63, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.02, \"dpitch\": 2.35, \"dyaw\": -2.63, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 2.35, \"dyaw\": -2.62, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 2.36, \"dyaw\": -2.62, \"droll\": 0.0}, {\"dx\": 2.52, \"dy\": 0.0, \"dz\": -0.04, \"dpitch\": 2.37, \"dyaw\": -2.63, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 6.03, "window_alt_abs_m": 0.29, "target_px_mean_hist": 503.2, "cur_frame_id": 25, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00032/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00033/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00034/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00035/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [38.13, 3.75, 20.14, -37.52, 2.54, 0.0]\n  Target bbox: [653.44, 395.01, 677.94, 467.74]\n\nFrame 2:\n  Drone pose: [38.69, 3.91, 19.93, -41.54, 4.13, 0.0]\n  Target bbox: [628.42, 324.55, 651.9, 394.97]\n\nFrame 3:\n  Drone pose: [39.15, 3.78, 19.95, -41.51, 4.48, 0.0]\n  Target bbox: [628.78, 329.81, 651.4, 389.54]\n\nFrame 4:\n  Drone pose: [39.75, 3.82, 20.12, -39.55, 7.49, 0.0]\n  Target bbox: [588.08, 363.54, 614.53, 436.2]\n\nFrame 5 (current):\n  Drone pose: [40.21, 3.94, 20.01, -41.7, 4.05, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.79, \"ymin\": 329.87, \"xmax\": 651.39, \"ymax\": 389.46}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.1, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.16, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": 0.07, \"dz\": -0.01, \"dpitch\": -0.06, \"dyaw\": -0.17, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.39, "window_alt_abs_m": 0.51, "target_px_mean_hist": 506.5, "cur_frame_id": 35, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00044/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00045/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00046/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [43.66, 4.12, 20.06, -40.66, 7.66, 0.0]\n  Target bbox: [576.1, 347.31, 600.88, 409.66]\n\nFrame 2:\n  Drone pose: [44.28, 4.01, 20.0, -41.78, 3.87, 0.0]\n  Target bbox: [628.47, 324.58, 651.85, 394.93]\n\nFrame 3:\n  Drone pose: [44.78, 4.01, 20.0, -42.5, 7.27, 0.0]\n  Target bbox: [584.15, 311.75, 610.77, 385.24]\n\nFrame 4:\n  Drone pose: [45.4, 4.01, 20.1, -42.9, 8.91, 0.0]\n  Target bbox: [563.93, 311.95, 591.41, 384.11]\n\nFrame 5 (current):\n  Drone pose: [45.79, 4.01, 20.0, -41.8, -1.12, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 690.55, \"ymin\": 325.02, \"xmax\": 715.4, \"ymax\": 397.66}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 4.99, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 4.99, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 4.99, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 4.98, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 4.97, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 18.86, "window_alt_abs_m": 0.26, "target_px_mean_hist": 510.0, "cur_frame_id": 46, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [49.3, 4.05, 20.0, -41.81, 3.79, 0.0]\n  Target bbox: [628.51, 325.36, 651.8, 394.15]\n\nFrame 2:\n  Drone pose: [49.65, 4.05, 19.99, -41.56, 7.36, 0.0]\n  Target bbox: [582.25, 329.28, 607.29, 393.52]\n\nFrame 3:\n  Drone pose: [50.5, 4.05, 19.98, -42.05, 3.81, 0.0]\n  Target bbox: [627.8, 326.28, 652.48, 393.13]\n\nFrame 4:\n  Drone pose: [50.86, 4.21, 19.88, -38.12, 0.24, 0.0]\n  Target bbox: [667.6, 388.87, 691.06, 452.5]\n\nFrame 5 (current):\n  Drone pose: [51.3, 4.11, 20.1, -46.38, 5.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 608.57, \"ymin\": 251.47, \"xmax\": 632.69, \"ymax\": 320.05}, \"waypoint_deltas\": [{\"dx\": 0.52, \"dy\": 0.1, \"dz\": -0.1, \"dpitch\": 4.53, \"dyaw\": -1.8, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.16, \"dz\": -0.1, \"dpitch\": 4.52, \"dyaw\": -1.96, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.23, \"dz\": -0.1, \"dpitch\": 4.5, \"dyaw\": -2.15, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.31, \"dz\": -0.1, \"dpitch\": 4.48, \"dyaw\": -2.37, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.4, \"dz\": -0.1, \"dpitch\": 4.46, \"dyaw\": -2.62, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 15.61, "window_alt_abs_m": 0.35, "target_px_mean_hist": 500.5, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00063/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00064/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00065/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00067/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [54.38, 4.62, 20.0, -41.95, 2.26, 0.0]\n  Target bbox: [628.58, 323.28, 651.77, 396.29]\n\nFrame 2:\n  Drone pose: [54.87, 4.61, 19.99, -39.82, 3.01, 0.0]\n  Target bbox: [618.95, 362.71, 642.59, 427.16]\n\nFrame 3:\n  Drone pose: [55.41, 4.85, 20.0, -42.01, 1.62, 0.0]\n  Target bbox: [629.13, 329.09, 651.09, 390.26]\n\nFrame 4:\n  Drone pose: [55.94, 4.98, 20.0, -40.24, -3.73, 0.0]\n  Target bbox: [690.5, 358.44, 714.3, 425.29]\n\nFrame 5 (current):\n  Drone pose: [56.43, 5.15, 20.0, -38.24, -0.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 650.94, \"ymin\": 389.87, \"xmax\": 673.16, \"ymax\": 457.89}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -3.89, \"dyaw\": 1.51, \"droll\": 0.0}, {\"dx\": 1.09, \"dy\": 0.19, \"dz\": 0.0, \"dpitch\": -3.93, \"dyaw\": 1.2, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": 0.28, \"dz\": 0.0, \"dpitch\": -3.98, \"dyaw\": 0.97, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": 0.34, \"dz\": 0.0, \"dpitch\": -4.02, \"dyaw\": 0.81, \"droll\": 0.0}, {\"dx\": 2.7, \"dy\": 0.37, \"dz\": 0.0, \"dpitch\": -4.06, \"dyaw\": -0.65, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.28, "window_alt_abs_m": 0.02, "target_px_mean_hist": 522.0, "cur_frame_id": 67, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00074/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00075/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00078/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [60.24, 5.44, 20.14, -44.7, -7.76, 0.0]\n  Target bbox: [688.29, 291.28, 715.37, 363.14]\n\nFrame 2:\n  Drone pose: [60.75, 5.47, 20.0, -42.45, -2.85, 0.0]\n  Target bbox: [618.21, 322.64, 661.86, 396.98]\n\nFrame 3:\n  Drone pose: [61.36, 5.51, 20.15, -42.78, -4.36, 0.0]\n  Target bbox: [627.11, 322.9, 652.52, 396.61]\n\nFrame 4:\n  Drone pose: [61.81, 5.36, 20.15, -42.13, -7.64, 0.0]\n  Target bbox: [667.88, 336.55, 704.04, 405.03]\n\nFrame 5 (current):\n  Drone pose: [62.24, 5.34, 19.94, -41.43, -3.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 600.54, \"ymin\": 339.14, \"xmax\": 624.7, \"ymax\": 408.41}, \"waypoint_deltas\": [{\"dx\": 0.68, \"dy\": -0.03, \"dz\": 0.06, \"dpitch\": -1.18, \"dyaw\": -2.15, \"droll\": 0.0}, {\"dx\": 1.23, \"dy\": -0.07, \"dz\": 0.06, \"dpitch\": -1.17, \"dyaw\": -3.45, \"droll\": 0.0}, {\"dx\": 1.77, \"dy\": -0.11, \"dz\": 0.06, \"dpitch\": -1.24, \"dyaw\": -3.34, \"droll\": 0.0}, {\"dx\": 2.32, \"dy\": -0.16, \"dz\": 0.06, \"dpitch\": -1.31, \"dyaw\": -3.22, \"droll\": 0.0}, {\"dx\": 2.86, \"dy\": -0.22, \"dz\": 0.06, \"dpitch\": -1.3, \"dyaw\": -4.45, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.27, "window_alt_abs_m": 0.5, "target_px_mean_hist": 508.8, "cur_frame_id": 78, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00084/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00088/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [65.75, 4.88, 20.06, -45.15, -7.44, 0.0]\n  Target bbox: [628.79, 290.17, 664.73, 359.15]\n\nFrame 2:\n  Drone pose: [66.19, 4.95, 20.0, -42.78, -8.47, 0.0]\n  Target bbox: [627.5, 329.39, 652.34, 389.85]\n\nFrame 3:\n  Drone pose: [66.73, 4.86, 20.0, -44.8, -13.22, 0.0]\n  Target bbox: [684.99, 295.63, 718.9, 362.36]\n\nFrame 4:\n  Drone pose: [67.26, 4.76, 20.0, -39.68, -11.83, 0.0]\n  Target bbox: [657.12, 379.35, 684.48, 446.33]\n\nFrame 5 (current):\n  Drone pose: [67.8, 4.65, 20.0, -42.89, -9.06, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.85, \"ymin\": 326.58, \"xmax\": 656.2, \"ymax\": 392.77}, \"waypoint_deltas\": [{\"dx\": 0.53, \"dy\": -0.1, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.12, \"droll\": 0.0}, {\"dx\": 1.06, \"dy\": -0.2, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.85, \"droll\": 0.0}, {\"dx\": 1.59, \"dy\": -0.3, \"dz\": 0.0, \"dpitch\": 0.05, \"dyaw\": -1.98, \"droll\": 0.0}, {\"dx\": 2.11, \"dy\": -0.41, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -1.7, \"droll\": 0.0}, {\"dx\": 2.64, \"dy\": -0.53, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.39, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.94, "window_alt_abs_m": 0.06, "target_px_mean_hist": 504.2, "cur_frame_id": 88, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00097/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00098/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540/aug_001/frames_playback/frame_00099/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [71.49, 3.83, 20.0, -43.22, -16.04, 0.0]\n  Target bbox: [685.3, 325.77, 718.12, 389.55]\n\nFrame 2:\n  Drone pose: [72.01, 3.64, 20.0, -47.34, -10.37, 0.0]\n  Target bbox: [605.57, 249.06, 634.9, 322.85]\n\nFrame 3:\n  Drone pose: [72.56, 3.55, 19.98, -43.48, -7.09, 0.0]\n  Target bbox: [563.55, 317.36, 602.56, 388.92]\n\nFrame 4:\n  Drone pose: [73.21, 3.22, 19.98, -47.11, -16.8, 0.0]\n  Target bbox: [677.55, 260.69, 714.75, 328.62]\n\nFrame 5 (current):\n  Drone pose: [73.58, 3.05, 20.0, -41.67, -9.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 576.22, \"ymin\": 343.83, \"xmax\": 619.4, \"ymax\": 418.58}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.17, \"dz\": 0.0, \"dpitch\": -1.14, \"dyaw\": -4.33, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": -0.31, \"dz\": 0.0, \"dpitch\": -1.21, \"dyaw\": -3.95, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.44, \"dz\": 0.0, \"dpitch\": -1.27, \"dyaw\": -3.62, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -1.32, \"dyaw\": -3.28, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": -0.69, \"dz\": 0.0, \"dpitch\": -1.38, \"dyaw\": -2.94, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.78, "window_alt_abs_m": 0.04, "target_px_mean_hist": 504.0, "cur_frame_id": 99, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_1540", "difficulty_score": 0.2611, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.61, -61.56, 22.0, -46.47, 0.0, 0.0]\n  Target bbox: [629.16, 325.95, 650.84, 393.37]\n\nFrame 2:\n  Drone pose: [-43.12, -62.71, 21.2, -43.91, 3.13, 0.0]\n  Target bbox: [629.1, 327.89, 651.14, 391.49]\n\nFrame 3:\n  Drone pose: [-43.1, -63.26, 20.67, -42.45, 4.54, 0.0]\n  Target bbox: [629.0, 330.77, 651.17, 388.57]\n\nFrame 4:\n  Drone pose: [-42.79, -63.5, 20.64, -42.14, 5.12, 0.0]\n  Target bbox: [628.32, 325.79, 651.99, 393.83]\n\nFrame 5 (current):\n  Drone pose: [-42.34, -63.57, 20.62, -42.03, 5.3, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.71, \"ymin\": 329.12, \"xmax\": 651.49, \"ymax\": 390.26}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.02, \"dz\": -0.03, \"dpitch\": 0.04, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": 0.06, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -0.02, \"dz\": -0.07, \"dpitch\": 0.08, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": -0.02, \"dz\": -0.09, \"dpitch\": 0.09, \"dyaw\": 0.06, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": -0.02, \"dz\": -0.2, \"dpitch\": 0.24, \"dyaw\": 0.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 5.3, "window_alt_abs_m": 1.38, "target_px_mean_hist": 470.5, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.8, -63.59, 20.42, -41.79, 5.37, 0.0]\n  Target bbox: [628.54, 326.23, 651.76, 393.35]\n\nFrame 2:\n  Drone pose: [-39.29, -63.59, 20.39, -41.76, 5.37, 0.0]\n  Target bbox: [627.76, 326.71, 652.53, 392.78]\n\nFrame 3:\n  Drone pose: [-38.78, -63.59, 20.36, -41.73, 5.37, 0.0]\n  Target bbox: [628.42, 325.8, 651.89, 393.82]\n\nFrame 4:\n  Drone pose: [-38.27, -63.59, 20.33, -41.7, 5.38, 0.0]\n  Target bbox: [628.33, 326.51, 651.96, 393.06]\n\nFrame 5 (current):\n  Drone pose: [-37.76, -63.59, 20.3, -41.67, 5.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.41, \"ymin\": 325.39, \"xmax\": 651.9, \"ymax\": 394.16}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": -0.03, \"dpitch\": 0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.0, \"dz\": -0.06, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": -0.01, \"dz\": -0.08, \"dpitch\": 0.08, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": -0.01, \"dz\": -0.11, \"dpitch\": 0.11, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": -0.01, \"dz\": -0.13, \"dpitch\": 0.14, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.12, "target_px_mean_hist": 479.0, "cur_frame_id": 13, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.72, -63.61, 20.15, -41.5, 5.43, 0.0]\n  Target bbox: [628.25, 326.62, 652.02, 392.88]\n\nFrame 2:\n  Drone pose: [-34.22, -63.63, 20.13, -41.47, 5.47, 0.0]\n  Target bbox: [628.45, 327.58, 651.8, 391.92]\n\nFrame 3:\n  Drone pose: [-33.72, -63.64, 20.12, -41.44, 5.52, 0.0]\n  Target bbox: [628.84, 330.23, 651.33, 389.15]\n\nFrame 4:\n  Drone pose: [-33.23, -63.67, 20.1, -41.42, 5.59, 0.0]\n  Target bbox: [627.78, 325.34, 652.54, 394.18]\n\nFrame 5 (current):\n  Drone pose: [-32.73, -63.7, 20.09, -41.39, 5.67, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.19, \"ymin\": 325.08, \"xmax\": 652.12, \"ymax\": 394.47}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.04, \"dz\": -0.01, \"dpitch\": 0.02, \"dyaw\": 0.09, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.08, \"dz\": -0.02, \"dpitch\": 0.03, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 1.53, \"dy\": -0.12, \"dz\": -0.03, \"dpitch\": 0.03, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": -0.16, \"dz\": -0.04, \"dpitch\": 0.02, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 2.57, \"dy\": -0.19, \"dz\": -0.05, \"dpitch\": 0.01, \"dyaw\": 0.52, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.24, "window_alt_abs_m": 0.06, "target_px_mean_hist": 485.5, "cur_frame_id": 23, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.16, -63.89, 20.04, -41.38, 6.19, 0.0]\n  Target bbox: [628.62, 330.1, 651.56, 389.28]\n\nFrame 2:\n  Drone pose: [-29.64, -63.92, 20.04, -41.4, 6.28, 0.0]\n  Target bbox: [628.08, 326.67, 652.2, 392.87]\n\nFrame 3:\n  Drone pose: [-29.11, -63.95, 20.03, -41.42, 6.35, 0.0]\n  Target bbox: [627.99, 325.73, 652.31, 393.79]\n\nFrame 4:\n  Drone pose: [-28.59, -63.96, 20.03, -41.44, 6.4, 0.0]\n  Target bbox: [627.14, 324.29, 653.2, 395.28]\n\nFrame 5 (current):\n  Drone pose: [-28.07, -63.97, 20.02, -41.45, 6.43, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.34, \"ymin\": 323.52, \"xmax\": 653.04, \"ymax\": 396.13}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.01, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.02, \"droll\": 0.0}, {\"dx\": 2.53, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -0.02, \"dyaw\": 0.03, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.23, "window_alt_abs_m": 0.02, "target_px_mean_hist": 490.2, "cur_frame_id": 32, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-25.04, -63.98, 20.01, -41.48, 6.46, 0.0]\n  Target bbox: [627.61, 324.32, 652.76, 395.37]\n\nFrame 2:\n  Drone pose: [-24.53, -63.98, 20.01, -41.48, 6.46, 0.0]\n  Target bbox: [628.23, 328.16, 652.0, 391.28]\n\nFrame 3:\n  Drone pose: [-24.03, -63.98, 20.0, -41.48, 6.46, 0.0]\n  Target bbox: [627.59, 323.72, 652.77, 395.94]\n\nFrame 4:\n  Drone pose: [-23.53, -63.98, 20.0, -41.48, 6.46, 0.0]\n  Target bbox: [628.45, 329.29, 651.74, 390.06]\n\nFrame 5 (current):\n  Drone pose: [-23.04, -63.98, 20.0, -41.47, 6.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.57, \"ymin\": 329.67, \"xmax\": 651.62, \"ymax\": 389.71}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.07, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.18, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": 0.08, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.01, "window_alt_abs_m": 0.0, "target_px_mean_hist": 501.0, "cur_frame_id": 42, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-20.53, -63.9, 20.0, -41.49, 6.24, 0.0]\n  Target bbox: [628.03, 326.34, 652.25, 393.16]\n\nFrame 2:\n  Drone pose: [-20.02, -63.89, 20.0, -41.5, 6.21, 0.0]\n  Target bbox: [627.71, 323.83, 652.65, 395.84]\n\nFrame 3:\n  Drone pose: [-19.51, -63.88, 20.0, -41.51, 6.21, 0.0]\n  Target bbox: [628.56, 329.97, 651.62, 389.4]\n\nFrame 4:\n  Drone pose: [-19.01, -63.88, 20.0, -41.52, 6.21, 0.0]\n  Target bbox: [628.02, 325.26, 652.3, 394.35]\n\nFrame 5 (current):\n  Drone pose: [-18.49, -63.88, 20.0, -41.54, 6.21, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.96, \"ymin\": 325.38, \"xmax\": 652.34, \"ymax\": 394.13}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": -0.06, \"dyaw\": -0.02, \"droll\": 0.0}, {\"dx\": 2.05, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 2.56, \"dy\": 0.05, \"dz\": 0.0, \"dpitch\": -0.08, \"dyaw\": -0.13, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.04, "window_alt_abs_m": 0.0, "target_px_mean_hist": 496.5, "cur_frame_id": 51, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-15.44, -63.79, 20.0, -41.62, 5.98, 0.0]\n  Target bbox: [627.79, 324.87, 652.55, 394.76]\n\nFrame 2:\n  Drone pose: [-14.96, -63.75, 20.0, -41.6, 5.86, 0.0]\n  Target bbox: [628.59, 329.95, 651.59, 389.41]\n\nFrame 3:\n  Drone pose: [-14.49, -63.7, 20.0, -41.56, 5.73, 0.0]\n  Target bbox: [627.38, 325.44, 652.93, 394.06]\n\nFrame 4:\n  Drone pose: [-14.02, -63.66, 20.0, -41.53, 5.61, 0.0]\n  Target bbox: [627.97, 325.21, 652.36, 394.4]\n\nFrame 5 (current):\n  Drone pose: [-13.55, -63.63, 20.0, -41.5, 5.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.69, \"ymin\": 330.04, \"xmax\": 651.49, \"ymax\": 389.33}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.45, "window_alt_abs_m": 0.0, "target_px_mean_hist": 498.8, "cur_frame_id": 61, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-11.05, -63.7, 20.0, -41.49, 5.72, 0.0]\n  Target bbox: [627.99, 324.27, 652.36, 395.39]\n\nFrame 2:\n  Drone pose: [-10.53, -63.73, 20.0, -41.51, 5.79, 0.0]\n  Target bbox: [628.7, 329.99, 651.48, 389.38]\n\nFrame 3:\n  Drone pose: [-10.02, -63.75, 20.0, -41.53, 5.85, 0.0]\n  Target bbox: [628.59, 329.98, 651.59, 389.38]\n\nFrame 4:\n  Drone pose: [-9.5, -63.77, 20.0, -41.55, 5.91, 0.0]\n  Target bbox: [627.9, 325.33, 652.43, 394.27]\n\nFrame 5 (current):\n  Drone pose: [-8.98, -63.79, 20.0, -41.56, 5.96, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.26, \"ymin\": 326.01, \"xmax\": 652.03, \"ymax\": 393.49}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.03, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.1, \"droll\": 0.0}, {\"dx\": 1.51, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": 0.15, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.19, \"droll\": 0.0}, {\"dx\": 2.51, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": 0.23, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.24, "window_alt_abs_m": 0.0, "target_px_mean_hist": 496.8, "cur_frame_id": 70, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.98, -63.89, 20.0, -41.56, 6.22, 0.0]\n  Target bbox: [628.21, 327.47, 652.05, 392.01]\n\nFrame 2:\n  Drone pose: [-5.48, -63.89, 20.0, -41.55, 6.24, 0.0]\n  Target bbox: [628.4, 330.03, 651.77, 389.27]\n\nFrame 3:\n  Drone pose: [-4.98, -63.9, 20.0, -41.55, 6.25, 0.0]\n  Target bbox: [628.07, 325.47, 652.24, 394.12]\n\nFrame 4:\n  Drone pose: [-4.49, -63.9, 20.0, -41.55, 6.25, 0.0]\n  Target bbox: [628.0, 324.85, 652.31, 394.7]\n\nFrame 5 (current):\n  Drone pose: [-3.99, -63.89, 20.0, -41.54, 6.24, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.99, \"ymin\": 325.19, \"xmax\": 652.34, \"ymax\": 394.43}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": -0.13, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.19, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": -0.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.04, "window_alt_abs_m": 0.0, "target_px_mean_hist": 501.2, "cur_frame_id": 80, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/ORI/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.5, -63.8, 20.0, -41.54, 6.0, 0.0]\n  Target bbox: [628.23, 325.69, 652.07, 393.84]\n\nFrame 2:\n  Drone pose: [-0.99, -63.78, 20.0, -41.55, 5.95, 0.0]\n  Target bbox: [627.52, 323.55, 652.84, 396.05]\n\nFrame 3:\n  Drone pose: [-0.49, -63.76, 20.0, -41.56, 5.9, 0.0]\n  Target bbox: [627.72, 327.62, 652.53, 391.8]\n\nFrame 4:\n  Drone pose: [0.02, -63.75, 20.0, -41.58, 5.85, 0.0]\n  Target bbox: [627.25, 324.42, 653.09, 395.13]\n\nFrame 5 (current):\n  Drone pose: [0.53, -63.73, 20.0, -41.59, 5.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 627.52, \"ymin\": 323.5, \"xmax\": 652.85, \"ymax\": 396.15}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.06, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -0.14, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -0.07, \"dyaw\": -0.23, \"droll\": 0.0}, {\"dx\": 2.06, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -0.35, \"droll\": 0.0}, {\"dx\": 2.57, \"dy\": 0.2, \"dz\": 0.0, \"dpitch\": -0.11, \"dyaw\": -0.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.2, "window_alt_abs_m": 0.0, "target_px_mean_hist": 493.2, "cur_frame_id": 89, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-42.58, -61.62, 22.17, -46.74, 0.2, 0.0]\n  Target bbox: [629.62, 329.48, 650.48, 389.79]\n\nFrame 2:\n  Drone pose: [-43.1, -62.74, 21.1, -46.5, 5.04, 0.0]\n  Target bbox: [606.48, 280.89, 629.67, 348.28]\n\nFrame 3:\n  Drone pose: [-43.1, -63.26, 20.67, -42.45, 4.54, 0.0]\n  Target bbox: [628.63, 325.71, 651.68, 393.89]\n\nFrame 4:\n  Drone pose: [-42.79, -63.5, 20.64, -46.13, 0.94, 0.0]\n  Target bbox: [680.2, 264.22, 704.6, 323.67]\n\nFrame 5 (current):\n  Drone pose: [-42.34, -63.57, 20.62, -40.27, 3.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 651.11, \"ymin\": 357.52, \"xmax\": 674.14, \"ymax\": 421.4}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.02, \"dz\": -0.03, \"dpitch\": -1.72, \"dyaw\": 1.84, \"droll\": 0.0}, {\"dx\": 1.01, \"dy\": -0.02, \"dz\": -0.05, \"dpitch\": -1.7, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -0.02, \"dz\": -0.07, \"dpitch\": -1.68, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": 2.03, \"dy\": -0.02, \"dz\": -0.09, \"dpitch\": -1.67, \"dyaw\": 1.85, \"droll\": 0.0}, {\"dx\": 2.54, \"dy\": -0.02, \"dz\": -0.2, \"dpitch\": -1.52, \"dyaw\": 1.86, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 11.5, "window_alt_abs_m": 1.55, "target_px_mean_hist": 468.8, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00009/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00010/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00011/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00012/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00013/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-39.64, -63.56, 20.47, -42.08, 5.32, 0.0]\n  Target bbox: [628.87, 330.54, 651.3, 388.81]\n\nFrame 2:\n  Drone pose: [-39.29, -63.59, 20.39, -41.76, 5.37, 0.0]\n  Target bbox: [628.79, 329.52, 651.4, 389.85]\n\nFrame 3:\n  Drone pose: [-38.78, -63.59, 20.36, -40.27, 7.09, 0.0]\n  Target bbox: [606.06, 349.66, 630.97, 419.59]\n\nFrame 4:\n  Drone pose: [-38.27, -63.59, 20.33, -36.7, 3.96, 0.0]\n  Target bbox: [646.44, 411.34, 669.84, 476.6]\n\nFrame 5 (current):\n  Drone pose: [-37.91, -63.52, 20.27, -36.46, 3.95, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 643.62, \"ymin\": 408.03, \"xmax\": 667.37, \"ymax\": 479.1}, \"waypoint_deltas\": [{\"dx\": 0.66, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": -5.19, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 1.17, \"dy\": -0.07, \"dz\": -0.03, \"dpitch\": -5.16, \"dyaw\": 1.43, \"droll\": 0.0}, {\"dx\": 1.68, \"dy\": -0.08, \"dz\": -0.05, \"dpitch\": -5.13, \"dyaw\": 1.44, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": -0.08, \"dz\": -0.08, \"dpitch\": -5.1, \"dyaw\": 1.45, \"droll\": 0.0}, {\"dx\": 2.69, \"dy\": -0.08, \"dz\": -0.1, \"dpitch\": -5.07, \"dyaw\": 1.46, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.9, "window_alt_abs_m": 0.2, "target_px_mean_hist": 481.0, "cur_frame_id": 13, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00019/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00020/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00021/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00022/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00023/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-34.72, -63.61, 20.15, -42.13, 10.43, 0.0]\n  Target bbox: [562.97, 314.79, 591.22, 387.32]\n\nFrame 2:\n  Drone pose: [-34.22, -63.63, 20.13, -41.47, 5.47, 0.0]\n  Target bbox: [628.25, 326.69, 652.02, 392.81]\n\nFrame 3:\n  Drone pose: [-33.78, -63.62, 20.29, -37.37, 3.91, 0.0]\n  Target bbox: [647.94, 401.75, 671.17, 460.81]\n\nFrame 4:\n  Drone pose: [-33.23, -63.67, 20.1, -41.42, 5.59, 0.0]\n  Target bbox: [627.8, 324.03, 652.56, 395.64]\n\nFrame 5 (current):\n  Drone pose: [-32.62, -63.61, 20.02, -41.44, 5.45, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.19, \"ymin\": 325.29, \"xmax\": 652.12, \"ymax\": 394.27}, \"waypoint_deltas\": [{\"dx\": 0.4, \"dy\": -0.13, \"dz\": 0.06, \"dpitch\": 0.07, \"dyaw\": 0.31, \"droll\": 0.0}, {\"dx\": 0.9, \"dy\": -0.17, \"dz\": 0.05, \"dpitch\": 0.08, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 1.42, \"dy\": -0.21, \"dz\": 0.04, \"dpitch\": 0.08, \"dyaw\": 0.53, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": -0.25, \"dz\": 0.03, \"dpitch\": 0.07, \"dyaw\": 0.64, \"droll\": 0.0}, {\"dx\": 2.46, \"dy\": -0.28, \"dz\": 0.02, \"dpitch\": 0.06, \"dyaw\": 0.74, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.34, "window_alt_abs_m": 0.44, "target_px_mean_hist": 483.8, "cur_frame_id": 23, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00030/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00031/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00032/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-30.16, -63.89, 20.04, -41.38, 6.19, 0.0]\n  Target bbox: [628.03, 325.24, 652.27, 394.29]\n\nFrame 2:\n  Drone pose: [-29.63, -64.04, 20.02, -41.36, 6.58, 0.0]\n  Target bbox: [628.41, 329.54, 651.77, 389.79]\n\nFrame 3:\n  Drone pose: [-29.25, -64.06, 20.01, -39.84, 10.97, 0.0]\n  Target bbox: [571.07, 348.87, 598.5, 418.74]\n\nFrame 4:\n  Drone pose: [-28.47, -63.96, 20.03, -40.01, 4.25, 0.0]\n  Target bbox: [654.76, 350.83, 680.47, 423.13]\n\nFrame 5 (current):\n  Drone pose: [-28.07, -63.97, 20.02, -39.3, 2.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 672.85, \"ymin\": 362.17, \"xmax\": 697.79, \"ymax\": 431.52}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -2.16, \"dyaw\": 3.59, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": -0.01, \"dz\": 0.0, \"dpitch\": -2.17, \"dyaw\": 3.6, \"droll\": 0.0}, {\"dx\": 1.52, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -2.17, \"dyaw\": 3.6, \"droll\": 0.0}, {\"dx\": 2.02, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -2.17, \"dyaw\": 3.6, \"droll\": 0.0}, {\"dx\": 2.53, \"dy\": -0.01, \"dz\": -0.01, \"dpitch\": -2.17, \"dyaw\": 3.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 12.9, "window_alt_abs_m": 0.07, "target_px_mean_hist": 486.5, "cur_frame_id": 32, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00038/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00039/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00042/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-25.01, -63.9, 20.01, -36.65, 4.97, 0.0]\n  Target bbox: [644.33, 406.86, 668.54, 476.9]\n\nFrame 2:\n  Drone pose: [-24.53, -63.98, 20.01, -41.48, 6.46, 0.0]\n  Target bbox: [628.24, 329.78, 651.94, 389.55]\n\nFrame 3:\n  Drone pose: [-24.16, -64.11, 20.04, -46.18, 9.88, 0.0]\n  Target bbox: [586.81, 243.43, 614.3, 315.13]\n\nFrame 4:\n  Drone pose: [-23.65, -63.96, 20.06, -41.4, 6.38, 0.0]\n  Target bbox: [628.05, 325.91, 652.24, 393.63]\n\nFrame 5 (current):\n  Drone pose: [-23.08, -63.81, 19.91, -41.3, 6.0, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.43, \"ymin\": 328.22, \"xmax\": 651.79, \"ymax\": 391.17}, \"waypoint_deltas\": [{\"dx\": 0.54, \"dy\": -0.16, \"dz\": 0.09, \"dpitch\": -0.17, \"dyaw\": 0.42, \"droll\": 0.0}, {\"dx\": 1.04, \"dy\": -0.14, \"dz\": 0.09, \"dpitch\": -0.17, \"dyaw\": 0.38, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": -0.12, \"dz\": 0.09, \"dpitch\": -0.17, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 2.04, \"dy\": -0.1, \"dz\": 0.09, \"dpitch\": -0.18, \"dyaw\": 0.27, \"droll\": 0.0}, {\"dx\": 2.55, \"dy\": -0.09, \"dz\": 0.09, \"dpitch\": -0.19, \"dyaw\": 0.24, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.78, "window_alt_abs_m": 0.2, "target_px_mean_hist": 493.0, "cur_frame_id": 42, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00047/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00048/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00049/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00050/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00051/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-20.66, -63.86, 19.9, -41.18, 6.11, 0.0]\n  Target bbox: [628.25, 327.8, 652.0, 391.67]\n\nFrame 2:\n  Drone pose: [-20.02, -63.89, 20.0, -37.82, 5.18, 0.0]\n  Target bbox: [641.3, 387.76, 665.45, 455.63]\n\nFrame 3:\n  Drone pose: [-19.6, -63.85, 19.84, -45.82, 3.94, 0.0]\n  Target bbox: [655.16, 245.52, 680.53, 318.31]\n\nFrame 4:\n  Drone pose: [-18.9, -63.95, 19.88, -41.32, 1.42, 0.0]\n  Target bbox: [690.33, 330.25, 716.18, 398.28]\n\nFrame 5 (current):\n  Drone pose: [-18.6, -63.79, 20.0, -45.37, 6.93, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.58, \"ymin\": 263.98, \"xmax\": 638.93, \"ymax\": 322.44}, \"waypoint_deltas\": [{\"dx\": 0.62, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 3.82, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 1.14, \"dy\": -0.09, \"dz\": 0.0, \"dpitch\": 3.79, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": -0.08, \"dz\": 0.0, \"dpitch\": 3.77, \"dyaw\": -0.74, \"droll\": 0.0}, {\"dx\": 2.16, \"dy\": -0.06, \"dz\": 0.0, \"dpitch\": 3.76, \"dyaw\": -0.78, \"droll\": 0.0}, {\"dx\": 2.67, \"dy\": -0.04, \"dz\": 0.0, \"dpitch\": 3.75, \"dyaw\": -0.85, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.19, "window_alt_abs_m": 0.42, "target_px_mean_hist": 498.8, "cur_frame_id": 51, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00057/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00058/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00059/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00060/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00061/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-15.44, -63.79, 20.0, -41.62, 5.98, 0.0]\n  Target bbox: [627.99, 325.12, 652.33, 394.43]\n\nFrame 2:\n  Drone pose: [-14.81, -63.75, 19.92, -45.02, 6.55, 0.0]\n  Target bbox: [620.03, 273.7, 643.4, 333.81]\n\nFrame 3:\n  Drone pose: [-14.52, -63.74, 20.02, -40.27, 5.17, 0.0]\n  Target bbox: [635.98, 345.81, 660.98, 416.99]\n\nFrame 4:\n  Drone pose: [-14.09, -63.69, 20.07, -41.54, 5.68, 0.0]\n  Target bbox: [628.01, 325.46, 652.31, 394.14]\n\nFrame 5 (current):\n  Drone pose: [-13.55, -63.63, 20.0, -41.5, 5.53, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.51, \"ymin\": 327.18, \"xmax\": 651.75, \"ymax\": 392.31}, \"waypoint_deltas\": [{\"dx\": 0.48, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": -0.03, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": 0.0, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": -0.01, \"droll\": 0.0}, {\"dx\": 1.48, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": 0.04, \"dyaw\": 0.05, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": -0.05, \"dz\": 0.0, \"dpitch\": 0.03, \"dyaw\": 0.12, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.07, \"dz\": 0.0, \"dpitch\": 0.01, \"dyaw\": 0.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.63, "window_alt_abs_m": 0.29, "target_px_mean_hist": 504.0, "cur_frame_id": 61, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00066/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00070/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-10.9, -63.74, 20.09, -42.8, 6.06, 0.0]\n  Target bbox: [625.01, 306.84, 650.12, 379.47]\n\nFrame 2:\n  Drone pose: [-10.53, -63.73, 20.0, -46.51, 4.89, 0.0]\n  Target bbox: [639.17, 239.28, 664.35, 312.25]\n\nFrame 3:\n  Drone pose: [-10.03, -63.59, 20.09, -41.66, 5.43, 0.0]\n  Target bbox: [628.23, 324.89, 652.08, 394.64]\n\nFrame 4:\n  Drone pose: [-9.54, -63.7, 20.08, -41.82, 2.64, 0.0]\n  Target bbox: [666.81, 321.92, 691.37, 392.16]\n\nFrame 5 (current):\n  Drone pose: [-8.9, -63.64, 19.97, -41.64, 5.6, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 628.15, \"ymin\": 324.77, \"xmax\": 652.17, \"ymax\": 394.76}, \"waypoint_deltas\": [{\"dx\": 0.43, \"dy\": -0.17, \"dz\": 0.03, \"dpitch\": 0.07, \"dyaw\": 0.41, \"droll\": 0.0}, {\"dx\": 0.93, \"dy\": -0.18, \"dz\": 0.03, \"dpitch\": 0.06, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": 1.43, \"dy\": -0.2, \"dz\": 0.03, \"dpitch\": 0.06, \"dyaw\": 0.51, \"droll\": 0.0}, {\"dx\": 1.93, \"dy\": -0.22, \"dz\": 0.03, \"dpitch\": 0.07, \"dyaw\": 0.55, \"droll\": 0.0}, {\"dx\": 2.43, \"dy\": -0.23, \"dz\": 0.03, \"dpitch\": 0.07, \"dyaw\": 0.59, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 7.47, "window_alt_abs_m": 0.29, "target_px_mean_hist": 494.5, "cur_frame_id": 70, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00076/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00077/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00078/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00079/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00080/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-5.98, -63.89, 20.0, -41.56, 6.22, 0.0]\n  Target bbox: [628.01, 326.24, 652.27, 393.25]\n\nFrame 2:\n  Drone pose: [-5.31, -63.8, 20.02, -45.58, 8.29, 0.0]\n  Target bbox: [598.83, 261.35, 624.64, 332.37]\n\nFrame 3:\n  Drone pose: [-5.1, -63.94, 19.93, -40.27, 4.92, 0.0]\n  Target bbox: [645.47, 340.85, 670.83, 412.85]\n\nFrame 4:\n  Drone pose: [-4.49, -63.9, 20.0, -44.71, 6.99, 0.0]\n  Target bbox: [618.82, 275.99, 642.23, 337.35]\n\nFrame 5 (current):\n  Drone pose: [-3.99, -63.89, 20.0, -46.54, 4.39, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 651.85, \"ymin\": 246.13, \"xmax\": 675.36, \"ymax\": 305.53}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": 0.01, \"dz\": 0.0, \"dpitch\": 5.01, \"dyaw\": 1.82, \"droll\": 0.0}, {\"dx\": 0.99, \"dy\": 0.02, \"dz\": 0.0, \"dpitch\": 5.01, \"dyaw\": 1.77, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": 0.04, \"dz\": 0.0, \"dpitch\": 5.01, \"dyaw\": 1.72, \"droll\": 0.0}, {\"dx\": 1.99, \"dy\": 0.07, \"dz\": 0.0, \"dpitch\": 5.0, \"dyaw\": 1.66, \"droll\": 0.0}, {\"dx\": 2.49, \"dy\": 0.09, \"dz\": 0.0, \"dpitch\": 5.0, \"dyaw\": 1.61, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.11, "window_alt_abs_m": 0.19, "target_px_mean_hist": 487.2, "cur_frame_id": 80, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00085/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00086/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00087/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00088/rgb.png", "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717/aug_001/frames_playback/frame_00089/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [-1.38, -63.89, 20.03, -44.35, 11.0, 0.0]\n  Target bbox: [566.55, 282.69, 594.17, 352.46]\n\nFrame 2:\n  Drone pose: [-0.99, -63.78, 20.0, -36.55, 3.42, 0.0]\n  Target bbox: [659.99, 409.42, 684.56, 479.24]\n\nFrame 3:\n  Drone pose: [-0.49, -63.76, 20.0, -38.38, 3.01, 0.0]\n  Target bbox: [664.68, 383.16, 688.59, 444.5]\n\nFrame 4:\n  Drone pose: [-0.07, -63.84, 19.96, -38.4, 9.73, 0.0]\n  Target bbox: [580.04, 375.92, 607.92, 445.77]\n\nFrame 5 (current):\n  Drone pose: [0.53, -63.73, 20.0, -39.94, 9.89, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 575.48, \"ymin\": 357.41, \"xmax\": 601.71, \"ymax\": 419.81}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.03, \"dz\": 0.0, \"dpitch\": -1.67, \"dyaw\": -4.15, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.06, \"dz\": 0.0, \"dpitch\": -1.69, \"dyaw\": -4.23, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.1, \"dz\": 0.0, \"dpitch\": -1.72, \"dyaw\": -4.32, \"droll\": 0.0}, {\"dx\": 2.06, \"dy\": 0.14, \"dz\": 0.0, \"dpitch\": -1.74, \"dyaw\": -4.44, \"droll\": 0.0}, {\"dx\": 2.57, \"dy\": 0.2, \"dz\": 0.0, \"dpitch\": -1.76, \"dyaw\": -4.58, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.87, "window_alt_abs_m": 0.1, "target_px_mean_hist": 504.2, "cur_frame_id": 89, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip195/2026-04-17/trajectory_717", "difficulty_score": 0.258, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [13.61, 146.91, 22.0, -46.4, -8.53, 0.0]\n  Target bbox: [615.49, 320.84, 664.67, 398.7] (model-predicted box)\n\nFrame 2:\n  Drone pose: [13.1, 145.39, 21.2, -44.06, -5.35, 0.0]\n  Target bbox: [622.99, 326.94, 656.7, 392.33]\n\nFrame 3:\n  Drone pose: [13.15, 144.54, 20.67, -42.77, -2.95, 0.0]\n  Target bbox: [616.71, 321.54, 663.33, 398.11]\n\nFrame 4:\n  Drone pose: [13.51, 144.09, 20.64, -42.54, -3.07, 0.0]\n  Target bbox: [618.65, 322.68, 661.38, 396.88]\n\nFrame 5 (current):\n  Drone pose: [14.02, 143.86, 20.62, -42.49, -3.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 623.53, \"ymin\": 325.59, \"xmax\": 656.09, \"ymax\": 393.8}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.12, \"dz\": -0.03, \"dpitch\": -0.06, \"dyaw\": 0.32, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.21, \"dz\": -0.05, \"dpitch\": -0.11, \"dyaw\": 0.56, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": -0.31, \"dz\": -0.07, \"dpitch\": -0.12, \"dyaw\": 0.82, \"droll\": 0.0}, {\"dx\": 2.14, \"dy\": -0.43, \"dz\": -0.09, \"dpitch\": -0.08, \"dyaw\": 1.14, \"droll\": 0.0}, {\"dx\": 2.59, \"dy\": -0.57, \"dz\": -0.2, \"dpitch\": 0.13, \"dyaw\": 1.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 1, "current_invisible": false, "window_yaw_abs_deg": 6.43, "window_alt_abs_m": 1.38, "target_px_mean_hist": 462.8, "cur_frame_id": 4, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [18.32, 142.72, 20.3, -41.82, -0.74, 0.0]\n  Target bbox: [624.07, 326.76, 655.59, 392.53]\n\nFrame 2:\n  Drone pose: [18.77, 142.62, 20.27, -41.71, -0.47, 0.0]\n  Target bbox: [624.54, 325.59, 655.19, 393.76]\n\nFrame 3:\n  Drone pose: [19.23, 142.55, 20.24, -41.62, -0.29, 0.0]\n  Target bbox: [624.5, 323.97, 655.3, 395.56]\n\nFrame 4:\n  Drone pose: [19.71, 142.52, 20.22, -41.56, -0.2, 0.0]\n  Target bbox: [624.38, 325.44, 655.51, 393.97]\n\nFrame 5 (current):\n  Drone pose: [20.21, 142.52, 20.19, -41.51, -0.19, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.17, \"ymin\": 326.37, \"xmax\": 655.73, \"ymax\": 392.96}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.03, \"dz\": -0.02, \"dpitch\": 0.02, \"dyaw\": -0.08, \"droll\": 0.0}, {\"dx\": 1.02, \"dy\": 0.09, \"dz\": -0.04, \"dpitch\": 0.03, \"dyaw\": -0.24, \"droll\": 0.0}, {\"dx\": 1.54, \"dy\": 0.18, \"dz\": -0.06, \"dpitch\": 0.02, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": 2.07, \"dy\": 0.3, \"dz\": -0.07, \"dpitch\": 0.01, \"dyaw\": -0.8, \"droll\": 0.0}, {\"dx\": 2.62, \"dy\": 0.44, \"dz\": -0.09, \"dpitch\": -0.02, \"dyaw\": -1.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.55, "window_alt_abs_m": 0.11, "target_px_mean_hist": 674.2, "cur_frame_id": 17, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00030/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.08, 143.7, 20.06, -41.76, -3.38, 0.0]\n  Target bbox: [623.02, 324.87, 656.57, 394.54]\n\nFrame 2:\n  Drone pose: [25.66, 143.88, 20.05, -41.84, -3.86, 0.0]\n  Target bbox: [623.03, 325.18, 656.58, 394.21]\n\nFrame 3:\n  Drone pose: [26.25, 144.02, 20.04, -41.94, -4.26, 0.0]\n  Target bbox: [623.34, 322.12, 656.17, 397.45]\n\nFrame 4:\n  Drone pose: [26.85, 144.11, 20.04, -42.06, -4.54, 0.0]\n  Target bbox: [623.19, 321.98, 656.31, 397.59]\n\nFrame 5 (current):\n  Drone pose: [27.46, 144.16, 20.03, -42.19, -4.68, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.93, \"ymin\": 321.01, \"xmax\": 663.11, \"ymax\": 398.6}, \"waypoint_deltas\": [{\"dx\": 0.62, \"dy\": -0.02, \"dz\": 0.0, \"dpitch\": -0.09, \"dyaw\": -1.35, \"droll\": 0.0}, {\"dx\": 1.24, \"dy\": -0.08, \"dz\": -0.01, \"dpitch\": -0.18, \"dyaw\": -2.58, \"droll\": 0.0}, {\"dx\": 1.87, \"dy\": -0.19, \"dz\": -0.01, \"dpitch\": -0.28, \"dyaw\": -3.7, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -0.33, \"dz\": -0.01, \"dpitch\": -0.37, \"dyaw\": -4.72, \"droll\": 0.0}, {\"dx\": 3.13, \"dy\": -0.51, \"dz\": -0.02, \"dpitch\": -0.47, \"dyaw\": -5.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.3, "window_alt_abs_m": 0.03, "target_px_mean_hist": 693.2, "cur_frame_id": 30, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [33.73, 142.51, 20.0, -43.12, -14.45, 0.0]\n  Target bbox: [615.1, 319.54, 665.11, 400.06]\n\nFrame 2:\n  Drone pose: [34.35, 142.25, 20.0, -43.19, -15.18, 0.0]\n  Target bbox: [615.88, 320.36, 664.34, 399.24]\n\nFrame 3:\n  Drone pose: [34.97, 141.97, 20.0, -43.27, -15.89, 0.0]\n  Target bbox: [614.93, 319.54, 665.34, 400.11]\n\nFrame 4:\n  Drone pose: [35.58, 141.69, 20.0, -43.33, -16.57, 0.0]\n  Target bbox: [615.37, 319.66, 664.88, 399.92]\n\nFrame 5 (current):\n  Drone pose: [36.19, 141.4, 20.0, -43.4, -17.22, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 619.87, \"ymin\": 322.14, \"xmax\": 660.31, \"ymax\": 397.2}, \"waypoint_deltas\": [{\"dx\": 0.61, \"dy\": -0.3, \"dz\": 0.0, \"dpitch\": -0.05, \"dyaw\": -0.63, \"droll\": 0.0}, {\"dx\": 1.21, \"dy\": -0.61, \"dz\": 0.0, \"dpitch\": -0.1, \"dyaw\": -1.25, \"droll\": 0.0}, {\"dx\": 1.81, \"dy\": -0.93, \"dz\": 0.0, \"dpitch\": -0.15, \"dyaw\": -1.85, \"droll\": 0.0}, {\"dx\": 2.4, \"dy\": -1.24, \"dz\": 0.0, \"dpitch\": -0.19, \"dyaw\": -2.44, \"droll\": 0.0}, {\"dx\": 2.99, \"dy\": -1.55, \"dz\": 0.0, \"dpitch\": -0.22, \"dyaw\": -3.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.77, "window_alt_abs_m": 0.0, "target_px_mean_hist": 711.0, "cur_frame_id": 44, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [41.52, 138.67, 20.0, -43.63, -22.82, 0.0]\n  Target bbox: [619.42, 322.41, 660.36, 396.88]\n\nFrame 2:\n  Drone pose: [42.09, 138.38, 20.0, -43.89, -22.12, 0.0]\n  Target bbox: [616.32, 320.33, 664.0, 399.16]\n\nFrame 3:\n  Drone pose: [42.66, 138.08, 20.0, -43.88, -22.72, 0.0]\n  Target bbox: [617.62, 320.85, 662.69, 398.59]\n\nFrame 4:\n  Drone pose: [43.24, 137.76, 20.0, -43.88, -23.29, 0.0]\n  Target bbox: [619.75, 322.31, 660.52, 397.04]\n\nFrame 5 (current):\n  Drone pose: [43.81, 137.42, 20.0, -43.88, -23.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 615.63, \"ymin\": 319.91, \"xmax\": 664.77, \"ymax\": 399.68}, \"waypoint_deltas\": [{\"dx\": 0.57, \"dy\": -0.35, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.48, \"droll\": 0.0}, {\"dx\": 1.14, \"dy\": -0.72, \"dz\": 0.0, \"dpitch\": -0.02, \"dyaw\": -0.9, \"droll\": 0.0}, {\"dx\": 1.7, \"dy\": -1.1, \"dz\": 0.0, \"dpitch\": -0.03, \"dyaw\": -1.29, \"droll\": 0.0}, {\"dx\": 2.26, \"dy\": -1.5, \"dz\": 0.0, \"dpitch\": -0.04, \"dyaw\": -1.64, \"droll\": 0.0}, {\"dx\": 2.81, \"dy\": -1.9, \"dz\": 0.01, \"dpitch\": -0.05, \"dyaw\": -1.96, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.38, "window_alt_abs_m": 0.0, "target_px_mean_hist": 708.2, "cur_frame_id": 57, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [49.26, 133.33, 20.02, -43.71, -26.74, 0.0]\n  Target bbox: [617.47, 320.88, 662.89, 398.51]\n\nFrame 2:\n  Drone pose: [49.77, 132.85, 20.02, -43.7, -26.79, 0.0]\n  Target bbox: [617.09, 320.8, 663.29, 398.66]\n\nFrame 3:\n  Drone pose: [50.26, 132.36, 20.02, -43.69, -26.81, 0.0]\n  Target bbox: [617.25, 321.01, 663.12, 398.44]\n\nFrame 4:\n  Drone pose: [50.75, 131.87, 20.03, -43.68, -26.85, 0.0]\n  Target bbox: [620.47, 322.79, 659.82, 396.53]\n\nFrame 5 (current):\n  Drone pose: [51.24, 131.38, 20.04, -43.67, -26.85, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.38, \"ymin\": 320.22, \"xmax\": 664.06, \"ymax\": 399.34}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": 0.0, \"dyaw\": 0.2, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -1.09, \"dz\": 0.01, \"dpitch\": 0.02, \"dyaw\": 0.35, \"droll\": 0.0}, {\"dx\": 1.38, \"dy\": -1.62, \"dz\": 0.02, \"dpitch\": 0.04, \"dyaw\": 0.46, \"droll\": 0.0}, {\"dx\": 1.84, \"dy\": -2.13, \"dz\": 0.03, \"dpitch\": 0.07, \"dyaw\": 0.54, \"droll\": 0.0}, {\"dx\": 2.31, \"dy\": -2.61, \"dz\": 0.04, \"dpitch\": 0.11, \"dyaw\": 0.53, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.12, "window_alt_abs_m": 0.02, "target_px_mean_hist": 719.5, "cur_frame_id": 71, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00084/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.4, 127.43, 20.15, -43.37, -26.55, 0.0]\n  Target bbox: [618.3, 321.07, 661.45, 398.35]\n\nFrame 2:\n  Drone pose: [55.87, 127.26, 20.17, -43.46, -26.07, 0.0]\n  Target bbox: [616.81, 319.25, 662.8, 400.34]\n\nFrame 3:\n  Drone pose: [56.34, 127.14, 20.19, -43.54, -25.71, 0.0]\n  Target bbox: [616.73, 319.59, 662.88, 400.03]\n\nFrame 4:\n  Drone pose: [56.81, 127.04, 20.22, -43.6, -25.43, 0.0]\n  Target bbox: [618.5, 322.2, 661.29, 397.2]\n\nFrame 5 (current):\n  Drone pose: [57.29, 126.94, 20.24, -43.68, -25.16, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.75, \"ymin\": 322.1, \"xmax\": 661.04, \"ymax\": 397.26}, \"waypoint_deltas\": [{\"dx\": 0.49, \"dy\": -0.11, \"dz\": 0.03, \"dpitch\": -0.1, \"dyaw\": 0.33, \"droll\": 0.0}, {\"dx\": 0.98, \"dy\": -0.27, \"dz\": 0.06, \"dpitch\": -0.23, \"dyaw\": 0.75, \"droll\": 0.0}, {\"dx\": 1.49, \"dy\": -0.49, \"dz\": 0.1, \"dpitch\": -0.42, \"dyaw\": 1.31, \"droll\": 0.0}, {\"dx\": 2.01, \"dy\": -0.76, \"dz\": 0.14, \"dpitch\": -0.36, \"dyaw\": 0.69, \"droll\": 0.0}, {\"dx\": 2.53, \"dy\": -1.09, \"dz\": 0.17, \"dpitch\": -0.34, \"dyaw\": 0.21, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 1.39, "window_alt_abs_m": 0.1, "target_px_mean_hist": 703.2, "cur_frame_id": 84, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [62.36, 124.8, 20.59, -44.36, -28.09, 0.0]\n  Target bbox: [620.52, 322.93, 659.81, 396.41]\n\nFrame 2:\n  Drone pose: [63.3, 124.91, 20.64, -44.57, -30.25, 0.0]\n  Target bbox: [618.06, 321.02, 662.4, 398.44]\n\nFrame 3:\n  Drone pose: [64.31, 125.11, 20.69, -44.74, -32.74, 0.0]\n  Target bbox: [621.34, 323.59, 659.0, 395.66]\n\nFrame 4:\n  Drone pose: [66.18, 126.31, 20.74, -44.97, -39.11, 0.0]\n  Target bbox: [621.19, 324.13, 659.22, 395.08]\n\nFrame 5 (current):\n  Drone pose: [68.05, 127.47, 20.79, -44.9, -45.36, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.12, \"ymin\": 325.74, \"xmax\": 655.68, \"ymax\": 393.43}, \"waypoint_deltas\": [{\"dx\": 0.99, \"dy\": 0.07, \"dz\": 0.05, \"dpitch\": 0.03, \"dyaw\": -2.18, \"droll\": 0.0}, {\"dx\": 1.92, \"dy\": 0.08, \"dz\": 0.1, \"dpitch\": 0.1, \"dyaw\": -4.09, \"droll\": 0.0}, {\"dx\": 2.67, \"dy\": -0.13, \"dz\": 0.15, \"dpitch\": 0.12, \"dyaw\": -5.17, \"droll\": 0.0}, {\"dx\": 3.36, \"dy\": -0.4, \"dz\": 0.2, \"dpitch\": 0.12, \"dyaw\": -6.02, \"droll\": 0.0}, {\"dx\": 3.86, \"dy\": -0.9, \"dz\": 0.25, \"dpitch\": 0.05, \"dyaw\": -6.02, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.27, "window_alt_abs_m": 0.2, "target_px_mean_hist": 704.2, "cur_frame_id": 97, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.41, 125.07, 21.25, -45.15, -51.38, 0.0]\n  Target bbox: [619.88, 323.84, 659.67, 395.46]\n\nFrame 2:\n  Drone pose: [74.91, 124.57, 21.29, -45.2, -51.38, 0.0]\n  Target bbox: [617.42, 321.49, 662.64, 397.92]\n\nFrame 3:\n  Drone pose: [75.41, 124.57, 21.32, -45.24, -51.38, 0.0]\n  Target bbox: [617.53, 321.57, 662.53, 397.84]\n\nFrame 4:\n  Drone pose: [75.91, 124.57, 21.34, -45.28, -51.38, 0.0]\n  Target bbox: [621.64, 324.78, 657.97, 394.49]\n\nFrame 5 (current):\n  Drone pose: [76.41, 124.07, 21.37, -45.31, -51.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.22, \"ymin\": 322.33, \"xmax\": 661.88, \"ymax\": 397.1}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.5, \"dz\": 0.03, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.5, \"dz\": 0.03, \"dpitch\": -0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -1.0, \"dz\": 0.04, \"dpitch\": -0.06, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.0, \"dz\": 0.03, \"dpitch\": -0.26, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.11, "target_px_mean_hist": 632.8, "cur_frame_id": 111, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00120/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/ORI/frames_playback/frame_00124/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [80.91, 122.07, 21.35, -45.5, -51.38, 0.0]\n  Target bbox: [617.02, 321.71, 663.05, 397.75]\n\nFrame 2:\n  Drone pose: [81.41, 122.07, 21.33, -45.47, -51.38, 0.0]\n  Target bbox: [620.71, 324.71, 658.88, 394.59]\n\nFrame 3:\n  Drone pose: [81.91, 121.57, 21.3, -45.43, -51.38, 0.0]\n  Target bbox: [615.12, 320.5, 664.9, 399.01]\n\nFrame 4:\n  Drone pose: [82.41, 121.57, 21.27, -45.39, -51.38, 0.0]\n  Target bbox: [617.88, 322.19, 662.2, 397.25]\n\nFrame 5 (current):\n  Drone pose: [82.91, 121.57, 21.24, -45.34, -51.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 620.81, \"ymin\": 323.16, \"xmax\": 658.73, \"ymax\": 396.18}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.5, \"dz\": -0.04, \"dpitch\": 0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -1.0, \"dz\": -0.08, \"dpitch\": 0.1, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -1.0, \"dz\": -0.12, \"dpitch\": 0.16, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -1.5, \"dz\": -0.16, \"dpitch\": 0.22, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.5, \"dz\": -0.2, \"dpitch\": 0.27, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 0.0, "window_alt_abs_m": 0.11, "target_px_mean_hist": 643.2, "cur_frame_id": 124, "source": "ORI", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00000/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00001/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00002/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00003/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00004/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [13.74, 146.81, 21.92, -46.49, -8.3, 0.0]\n  Target bbox: [615.5, 320.67, 664.66, 398.84]\n\nFrame 2:\n  Drone pose: [13.14, 145.58, 21.24, -44.14, -5.87, 0.0]\n  Target bbox: [623.09, 323.88, 656.48, 395.58]\n\nFrame 3:\n  Drone pose: [13.07, 144.65, 20.66, -42.8, -6.33, 0.0]\n  Target bbox: [655.44, 319.89, 701.88, 395.85]\n\nFrame 4:\n  Drone pose: [13.47, 144.2, 20.48, -45.47, -8.36, 0.0]\n  Target bbox: [677.09, 268.31, 728.54, 346.88]\n\nFrame 5 (current):\n  Drone pose: [14.02, 143.86, 20.62, -41.46, 1.2, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 560.47, \"ymin\": 345.66, \"xmax\": 594.72, \"ymax\": 411.89}, \"waypoint_deltas\": [{\"dx\": 0.56, \"dy\": -0.12, \"dz\": -0.03, \"dpitch\": -1.09, \"dyaw\": -4.68, \"droll\": 0.0}, {\"dx\": 1.12, \"dy\": -0.21, \"dz\": -0.05, \"dpitch\": -1.14, \"dyaw\": -4.44, \"droll\": 0.0}, {\"dx\": 1.65, \"dy\": -0.31, \"dz\": -0.07, \"dpitch\": -1.15, \"dyaw\": -4.18, \"droll\": 0.0}, {\"dx\": 2.14, \"dy\": -0.43, \"dz\": -0.09, \"dpitch\": -1.11, \"dyaw\": -3.86, \"droll\": 0.0}, {\"dx\": 2.59, \"dy\": -0.57, \"dz\": -0.2, \"dpitch\": -0.9, \"dyaw\": -3.49, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 14.49, "window_alt_abs_m": 1.57, "target_px_mean_hist": 482.2, "cur_frame_id": 4, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00013/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00014/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00015/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00016/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00017/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [18.25, 142.82, 20.31, -40.17, -3.78, 0.0]\n  Target bbox: [658.46, 352.36, 691.73, 420.7]\n\nFrame 2:\n  Drone pose: [18.79, 142.65, 20.41, -44.49, -0.41, 0.0]\n  Target bbox: [622.48, 283.19, 653.75, 350.8]\n\nFrame 3:\n  Drone pose: [19.24, 142.4, 20.21, -41.59, 0.11, 0.0]\n  Target bbox: [624.75, 323.95, 655.33, 395.54]\n\nFrame 4:\n  Drone pose: [19.71, 142.52, 20.22, -41.56, -0.2, 0.0]\n  Target bbox: [624.22, 323.91, 655.64, 395.62]\n\nFrame 5 (current):\n  Drone pose: [20.2, 142.33, 20.17, -41.48, 0.29, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.56, \"ymin\": 327.7, \"xmax\": 655.57, \"ymax\": 391.55}, \"waypoint_deltas\": [{\"dx\": 0.51, \"dy\": 0.22, \"dz\": 0.0, \"dpitch\": -0.01, \"dyaw\": -0.56, \"droll\": 0.0}, {\"dx\": 1.03, \"dy\": 0.28, \"dz\": -0.02, \"dpitch\": 0.0, \"dyaw\": -0.72, \"droll\": 0.0}, {\"dx\": 1.55, \"dy\": 0.37, \"dz\": -0.04, \"dpitch\": -0.01, \"dyaw\": -0.96, \"droll\": 0.0}, {\"dx\": 2.08, \"dy\": 0.49, \"dz\": -0.05, \"dpitch\": -0.02, \"dyaw\": -1.28, \"droll\": 0.0}, {\"dx\": 2.63, \"dy\": 0.63, \"dz\": -0.07, \"dpitch\": -0.05, \"dyaw\": -1.67, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 4.69, "window_alt_abs_m": 0.35, "target_px_mean_hist": 674.8, "cur_frame_id": 17, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00026/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00027/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00028/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00029/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00030/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [25.05, 143.55, 20.02, -38.75, -4.46, 0.0]\n  Target bbox: [642.36, 371.95, 674.98, 446.25]\n\nFrame 2:\n  Drone pose: [25.58, 143.84, 19.88, -41.47, -8.75, 0.0]\n  Target bbox: [684.65, 324.04, 721.27, 399.66]\n\nFrame 3:\n  Drone pose: [26.22, 144.11, 20.12, -42.01, -4.49, 0.0]\n  Target bbox: [623.26, 325.61, 656.4, 393.68]\n\nFrame 4:\n  Drone pose: [26.88, 143.97, 20.1, -40.16, -5.48, 0.0]\n  Target bbox: [639.5, 356.94, 673.67, 431.65]\n\nFrame 5 (current):\n  Drone pose: [27.33, 144.24, 19.95, -41.87, -6.28, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 634.22, \"ymin\": 321.08, \"xmax\": 681.8, \"ymax\": 399.28}, \"waypoint_deltas\": [{\"dx\": 0.75, \"dy\": -0.1, \"dz\": 0.08, \"dpitch\": -0.41, \"dyaw\": 0.25, \"droll\": 0.0}, {\"dx\": 1.37, \"dy\": -0.16, \"dz\": 0.07, \"dpitch\": -0.5, \"dyaw\": -0.98, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -0.27, \"dz\": 0.07, \"dpitch\": -0.6, \"dyaw\": -2.1, \"droll\": 0.0}, {\"dx\": 2.63, \"dy\": -0.41, \"dz\": 0.07, \"dpitch\": -0.69, \"dyaw\": -3.12, \"droll\": 0.0}, {\"dx\": 3.26, \"dy\": -0.59, \"dz\": 0.06, \"dpitch\": -0.79, \"dyaw\": -4.07, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 10.34, "window_alt_abs_m": 0.56, "target_px_mean_hist": 695.0, "cur_frame_id": 30, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00040/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00041/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00042/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00043/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00044/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [33.73, 142.51, 20.0, -43.12, -14.45, 0.0]\n  Target bbox: [617.5, 320.99, 662.67, 398.48]\n\nFrame 2:\n  Drone pose: [34.29, 142.13, 19.92, -43.03, -14.82, 0.0]\n  Target bbox: [618.16, 321.37, 662.04, 398.12]\n\nFrame 3:\n  Drone pose: [34.79, 141.99, 19.98, -45.42, -13.13, 0.0]\n  Target bbox: [585.29, 281.52, 628.44, 357.33]\n\nFrame 4:\n  Drone pose: [35.71, 141.68, 20.05, -40.82, -20.81, 0.0]\n  Target bbox: [669.71, 369.93, 712.79, 444.73]\n\nFrame 5 (current):\n  Drone pose: [36.3, 141.44, 20.15, -43.73, -17.42, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 616.55, \"ymin\": 319.67, \"xmax\": 663.76, \"ymax\": 399.88}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.34, \"dz\": -0.15, \"dpitch\": 0.28, \"dyaw\": -0.43, \"droll\": 0.0}, {\"dx\": 1.1, \"dy\": -0.65, \"dz\": -0.15, \"dpitch\": 0.23, \"dyaw\": -1.05, \"droll\": 0.0}, {\"dx\": 1.7, \"dy\": -0.97, \"dz\": -0.15, \"dpitch\": 0.18, \"dyaw\": -1.65, \"droll\": 0.0}, {\"dx\": 2.29, \"dy\": -1.28, \"dz\": -0.15, \"dpitch\": 0.14, \"dyaw\": -2.24, \"droll\": 0.0}, {\"dx\": 2.88, \"dy\": -1.59, \"dz\": -0.15, \"dpitch\": 0.11, \"dyaw\": -2.84, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 13.14, "window_alt_abs_m": 0.31, "target_px_mean_hist": 697.2, "cur_frame_id": 44, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00053/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00054/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00055/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00056/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00057/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [41.67, 138.58, 20.07, -44.81, -25.7, 0.0]\n  Target bbox: [655.5, 309.67, 697.1, 383.51]\n\nFrame 2:\n  Drone pose: [42.09, 138.38, 20.0, -42.76, -24.84, 0.0]\n  Target bbox: [650.3, 340.4, 696.85, 418.06]\n\nFrame 3:\n  Drone pose: [42.56, 138.08, 19.85, -43.52, -22.61, 0.0]\n  Target bbox: [615.84, 319.55, 664.56, 400.04]\n\nFrame 4:\n  Drone pose: [43.24, 137.76, 20.0, -43.58, -23.2, 0.0]\n  Target bbox: [615.01, 324.86, 663.3, 404.53]\n\nFrame 5 (current):\n  Drone pose: [43.88, 137.54, 19.89, -41.81, -28.8, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 674.35, \"ymin\": 355.57, \"xmax\": 718.13, \"ymax\": 430.98}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.47, \"dz\": 0.11, \"dpitch\": -2.08, \"dyaw\": 4.52, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.84, \"dz\": 0.11, \"dpitch\": -2.09, \"dyaw\": 4.1, \"droll\": 0.0}, {\"dx\": 1.63, \"dy\": -1.22, \"dz\": 0.11, \"dpitch\": -2.1, \"dyaw\": 3.71, \"droll\": 0.0}, {\"dx\": 2.19, \"dy\": -1.62, \"dz\": 0.11, \"dpitch\": -2.11, \"dyaw\": 3.36, \"droll\": 0.0}, {\"dx\": 2.74, \"dy\": -2.02, \"dz\": 0.12, \"dpitch\": -2.12, \"dyaw\": 3.04, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.27, "window_alt_abs_m": 0.48, "target_px_mean_hist": 715.8, "cur_frame_id": 57, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00067/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00068/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00069/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00070/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00071/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [49.26, 133.33, 20.02, -43.71, -26.74, 0.0]\n  Target bbox: [618.12, 321.06, 662.24, 398.35]\n\nFrame 2:\n  Drone pose: [49.76, 132.95, 20.04, -43.7, -31.04, 0.0]\n  Target bbox: [664.9, 320.92, 713.3, 398.91]\n\nFrame 3:\n  Drone pose: [50.27, 132.46, 19.9, -43.48, -28.45, 0.0]\n  Target bbox: [635.96, 321.5, 677.92, 397.14]\n\nFrame 4:\n  Drone pose: [50.7, 131.78, 20.13, -45.39, -28.55, 0.0]\n  Target bbox: [646.17, 297.29, 683.65, 370.06]\n\nFrame 5 (current):\n  Drone pose: [51.24, 131.38, 20.04, -40.1, -30.51, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 662.74, \"ymin\": 382.59, \"xmax\": 707.54, \"ymax\": 458.55}, \"waypoint_deltas\": [{\"dx\": 0.46, \"dy\": -0.56, \"dz\": 0.0, \"dpitch\": -3.57, \"dyaw\": 3.86, \"droll\": 0.0}, {\"dx\": 0.92, \"dy\": -1.09, \"dz\": 0.01, \"dpitch\": -3.55, \"dyaw\": 4.01, \"droll\": 0.0}, {\"dx\": 1.38, \"dy\": -1.62, \"dz\": 0.02, \"dpitch\": -3.53, \"dyaw\": 4.12, \"droll\": 0.0}, {\"dx\": 1.84, \"dy\": -2.13, \"dz\": 0.03, \"dpitch\": -3.5, \"dyaw\": 4.2, \"droll\": 0.0}, {\"dx\": 2.31, \"dy\": -2.61, \"dz\": 0.04, \"dpitch\": -3.46, \"dyaw\": 4.19, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 8.96, "window_alt_abs_m": 0.48, "target_px_mean_hist": 726.0, "cur_frame_id": 71, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00080/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00081/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00082/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00083/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00084/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [55.4, 127.43, 20.15, -43.37, -26.55, 0.0]\n  Target bbox: [617.73, 320.37, 661.97, 399.13]\n\nFrame 2:\n  Drone pose: [55.76, 127.19, 20.14, -39.72, -21.1, 0.0]\n  Target bbox: [560.81, 384.19, 604.26, 459.59]\n\nFrame 3:\n  Drone pose: [56.34, 127.14, 20.19, -38.81, -22.87, 0.0]\n  Target bbox: [583.43, 401.75, 626.15, 477.76]\n\nFrame 4:\n  Drone pose: [56.7, 126.93, 20.21, -43.52, -25.01, 0.0]\n  Target bbox: [618.12, 321.64, 661.63, 397.8]\n\nFrame 5 (current):\n  Drone pose: [57.2, 127.11, 20.25, -43.47, -25.46, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.66, \"ymin\": 322.41, \"xmax\": 661.14, \"ymax\": 396.99}, \"waypoint_deltas\": [{\"dx\": 0.58, \"dy\": -0.28, \"dz\": 0.02, \"dpitch\": -0.31, \"dyaw\": 0.63, \"droll\": 0.0}, {\"dx\": 1.07, \"dy\": -0.44, \"dz\": 0.05, \"dpitch\": -0.44, \"dyaw\": 1.05, \"droll\": 0.0}, {\"dx\": 1.58, \"dy\": -0.66, \"dz\": 0.09, \"dpitch\": -0.63, \"dyaw\": 1.61, \"droll\": 0.0}, {\"dx\": 2.1, \"dy\": -0.93, \"dz\": 0.13, \"dpitch\": -0.57, \"dyaw\": 0.99, \"droll\": 0.0}, {\"dx\": 2.62, \"dy\": -1.26, \"dz\": 0.16, \"dpitch\": -0.55, \"dyaw\": 0.51, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 9.8, "window_alt_abs_m": 0.13, "target_px_mean_hist": 690.8, "cur_frame_id": 84, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00093/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00094/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00095/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00096/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00097/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [62.4, 124.65, 20.69, -44.67, -27.76, 0.0]\n  Target bbox: [617.43, 321.43, 662.94, 397.95]\n\nFrame 2:\n  Drone pose: [63.23, 124.97, 20.71, -40.14, -31.63, 0.0]\n  Target bbox: [635.05, 396.62, 678.27, 471.34]\n\nFrame 3:\n  Drone pose: [64.49, 125.12, 20.65, -43.61, -38.07, 0.0]\n  Target bbox: [679.37, 346.89, 720.85, 419.19]\n\nFrame 4:\n  Drone pose: [66.18, 126.31, 20.74, -40.77, -34.11, 0.0]\n  Target bbox: [559.26, 393.72, 601.35, 470.59]\n\nFrame 5 (current):\n  Drone pose: [68.03, 127.32, 20.82, -45.08, -45.02, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 624.39, \"ymin\": 325.98, \"xmax\": 655.6, \"ymax\": 393.17}, \"waypoint_deltas\": [{\"dx\": 1.01, \"dy\": 0.22, \"dz\": 0.02, \"dpitch\": 0.21, \"dyaw\": -2.52, \"droll\": 0.0}, {\"dx\": 1.94, \"dy\": 0.23, \"dz\": 0.07, \"dpitch\": 0.28, \"dyaw\": -4.43, \"droll\": 0.0}, {\"dx\": 2.69, \"dy\": 0.02, \"dz\": 0.12, \"dpitch\": 0.3, \"dyaw\": -5.51, \"droll\": 0.0}, {\"dx\": 3.38, \"dy\": -0.25, \"dz\": 0.17, \"dpitch\": 0.3, \"dyaw\": -6.36, \"droll\": 0.0}, {\"dx\": 3.88, \"dy\": -0.75, \"dz\": 0.22, \"dpitch\": 0.23, \"dyaw\": -6.36, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 25.19, "window_alt_abs_m": 0.25, "target_px_mean_hist": 697.5, "cur_frame_id": 97, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00107/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00108/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00109/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00110/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00111/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [74.31, 125.18, 21.24, -44.92, -51.36, 0.0]\n  Target bbox: [620.05, 323.52, 659.48, 395.88]\n\nFrame 2:\n  Drone pose: [74.92, 124.54, 21.2, -45.74, -50.08, 0.0]\n  Target bbox: [604.17, 312.35, 645.22, 386.52]\n\nFrame 3:\n  Drone pose: [75.41, 124.57, 21.32, -45.24, -51.38, 0.0]\n  Target bbox: [619.1, 322.69, 661.01, 396.71]\n\nFrame 4:\n  Drone pose: [75.94, 124.56, 21.32, -45.29, -51.42, 0.0]\n  Target bbox: [622.68, 325.28, 656.97, 393.97]\n\nFrame 5 (current):\n  Drone pose: [76.41, 124.07, 21.37, -45.31, -51.38, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 618.03, \"ymin\": 322.16, \"xmax\": 662.07, \"ymax\": 397.27}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": 0.0, \"dz\": 0.01, \"dpitch\": -0.02, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -0.5, \"dz\": 0.03, \"dpitch\": -0.04, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -0.5, \"dz\": 0.03, \"dpitch\": -0.05, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -1.0, \"dz\": 0.04, \"dpitch\": -0.06, \"dyaw\": 0.0, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.0, \"dz\": 0.03, \"dpitch\": -0.26, \"dyaw\": 0.0, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 2.65, "window_alt_abs_m": 0.2, "target_px_mean_hist": 629.5, "cur_frame_id": 111, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
+{"images": ["/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00120/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00121/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00122/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00123/rgb.png", "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052/aug_001/frames_playback/frame_00124/rgb.png"], "messages": [{"role": "system", "content": "You are a drone navigation expert. Given five sequential observation frames and the drone ego-state history, predict the target bounding box for the current frame and relative waypoint offsets for the next five frames.\n\nInput format:\n- Frames 1-4 (history): each provides the drone pose and a 2D axis-aligned target bounding box [xmin, ymin, xmax, ymax] in image-plane pixel coordinates.\n- Frame 5 (current): only the drone pose is provided; target bounding-box observations for this timestep are omitted (e.g., detection not yet applied).\n- Drone pose: [x, y, z, pitch, yaw, roll] in the world frame (position in meters, Euler angles in degrees).\n- Target bounding-box semantics: when the target is visible in the camera view, the box denotes ground-truth image-plane localization; when the target is not visible, the box encodes a trajectory-inferred localization and is explicitly annotated in the frame text (e.g., as a model-predicted box).\n\nOutput:\n1. current_bbox: the predicted target bounding box [xmin, ymin, xmax, ymax] for the current frame (Frame 5).\n2. 
waypoint_deltas: five waypoint deltas relative to the current drone pose.\n   - Each delta: [dx, dy, dz, dpitch, dyaw, droll]\n\nRespond ONLY with a JSON object."}, {"role": "user", "content": "<image>\n<image>\n<image>\n<image>\n<image>\nFlight state history (t-4 to t):\n\nFrame 1:\n  Drone pose: [80.99, 122.0, 21.42, -45.76, -51.44, 0.0]\n  Target bbox: [619.61, 322.95, 660.5, 396.39]\n\nFrame 2:\n  Drone pose: [81.26, 122.07, 21.45, -48.9, -54.82, 0.0]\n  Target bbox: [665.41, 268.88, 704.37, 338.85]\n\nFrame 3:\n  Drone pose: [81.91, 121.57, 21.3, -43.81, -46.38, 0.0]\n  Target bbox: [558.17, 351.24, 603.13, 426.2]\n\nFrame 4:\n  Drone pose: [82.41, 121.57, 21.27, -41.27, -51.78, 0.0]\n  Target bbox: [620.36, 390.23, 669.16, 467.95]\n\nFrame 5 (current):\n  Drone pose: [82.91, 121.63, 21.19, -45.22, -51.49, 0.0]\n\nPredict the target bounding box for the current frame and the next 5 waypoint deltas from current pose."}, {"role": "assistant", "content": "{\"current_bbox\": {\"xmin\": 621.71, \"ymin\": 324.9, \"xmax\": 657.91, \"ymax\": 394.37}, \"waypoint_deltas\": [{\"dx\": 0.5, \"dy\": -0.56, \"dz\": 0.01, \"dpitch\": -0.07, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 1.0, \"dy\": -1.06, \"dz\": -0.03, \"dpitch\": -0.02, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 1.5, \"dy\": -1.06, \"dz\": -0.07, \"dpitch\": 0.04, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 2.0, \"dy\": -1.56, \"dz\": -0.11, \"dpitch\": 0.1, \"dyaw\": 0.11, \"droll\": 0.0}, {\"dx\": 2.5, \"dy\": -1.56, \"dz\": -0.15, \"dpitch\": 0.15, \"dyaw\": 0.11, \"droll\": 0.0}]}"}], "sample_flags": {"hist_invisible_cnt": 0, "current_invisible": false, "window_yaw_abs_deg": 17.49, "window_alt_abs_m": 0.29, "target_px_mean_hist": 651.5, "cur_frame_id": 124, "source": "aug_001", "fut_invisible_cnt": 0}, "difficulty": "easy", "trajectory": "/obs:/autel-data-vla/data-v5/ip38/2026-04-16/trajectory_1776306052", "difficulty_score": 0.4543, "seen_in_selected_250k": false, "seen_group": "unseen"}
diff --git a/scripts/exp4_strict_offline_analysis.py b/scripts/exp4_strict_offline_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae974456ea279c19e89a99afe1b20a4a0134e1f1
--- /dev/null
+++ b/scripts/exp4_strict_offline_analysis.py
@@ -0,0 +1,364 @@
+#!/usr/bin/env python3
+"""Offline strict analysis from raw_errors_*.json files.
+
+This computes EXACT metrics that need sample-level data (joint constraints,
+percentile distributions, failure mode breakdown, Pareto frontier, etc.) that
+the on-line eval script cannot easily aggregate.
+"""
+import json
+import glob
+import os
+import math
+from collections import OrderedDict, defaultdict
+import numpy as np
+
+
+OUT_A = "/mnt/sfs_turbo_new/R11181/project_vlm/exp_v5/output/job_exp4_settingA_20260430_083003"  # hard-coded job output directory (the "settingA" run, going by the path)
+OUT_B = "/mnt/sfs_turbo_new/R11181/project_vlm/exp_v5/output/job_exp4_settingB_20260430_083037"  # hard-coded job output directory (the "settingB" run, going by the path)
+
+DIMS = ["dx", "dy", "dz", "dpitch", "dyaw", "droll"]  # the six per-waypoint keys: three translation components, three rotation components
+SEEN_BY_B = {"Town01_Opt", "Town02_Opt", "Town03_Opt", "Town04_Opt",  # NOTE(review): presumably the maps setting B was trained on -- confirm
+             "Town05_Opt", "Town06_Opt", "Town07_Opt"}
+UNSEEN_BY_B = {"Town10HD"}  # NOTE(review): presumably the map held out from setting B -- confirm
+
+
+def load_raw(out_dir):  # gather the "errors_per_sample" payload from each eval_strict_* subdirectory of out_dir
+    """Returns {map_name: list of sample dicts (each sample has dim->list[per_wp])}."""
+    res = {}
+    for d in sorted(glob.glob(f"{out_dir}/eval_strict_*")):  # sorted so maps are inserted in a stable order
+        if not os.path.isdir(d):  # ignore plain files that happen to match the pattern
+            continue
+        map_name = os.path.basename(d).replace("eval_strict_", "")  # NOTE(review): replace() removes every occurrence, not only the leading prefix
+        files = glob.glob(f"{d}/raw_errors_*.json")
+        if not files:  # a directory without a raw-errors dump is silently left out of the result
+            continue
+        with open(files[0]) as f:  # NOTE(review): glob order is unspecified, so with several matches the file used is arbitrary
+            payload = json.load(f)
+        res[map_name] = payload["errors_per_sample"]  # raises KeyError if the JSON lacks this key
+    return res
+
+
+def per_sample_pos_rot(sample):  # collapse the six per-waypoint components into one position norm and one rotation norm per waypoint
+    """Convert {dim:[per_wp]} to ([pos_per_wp], [rot_per_wp])."""
+    pos = []
+    rot = []
+    nw = len(sample["dx"])  # waypoint count comes from "dx"; the other five lists are indexed over the same range
+    for i in range(nw):
+        p = math.sqrt(sample["dx"][i]**2 + sample["dy"][i]**2 + sample["dz"][i]**2)  # Euclidean norm of (dx, dy, dz) at waypoint i
+        r = math.sqrt(sample["dpitch"][i]**2 + sample["dyaw"][i]**2 + sample["droll"][i]**2)  # Euclidean norm of (dpitch, dyaw, droll) at waypoint i
+        pos.append(p)
+        rot.append(r)
+    return pos, rot  # two parallel lists, one entry per waypoint
+
+
+def aggregate_metrics(samples):
+    """Compute EXACT strict metrics from sample-level raw data."""
+    if not samples:
+        return {}
+    n = len(samples)
+    pos_rot = [per_sample_pos_rot(s) for s in samples]
+    all_pos = [p for poss, _ in pos_rot for p in poss]
+    all_rot = [r for _, rots in pos_rot for r in rots]
+    fde = [poss[-1] for poss, _ in pos_rot]
+    ade = [sum(poss)/len(poss) for poss, _ in pos_rot]
+    fde_rot = [rots[-1] for _, rots in pos_rot]
+    ade_rot = [sum(rots)/len(rots) for _, rots in pos_rot]
+
+    m = OrderedDict()
+
+    # ---- Sample-level rates (any wp under threshold) ----
+    POS_THRS = [0.1, 0.2, 0.3, 0.5, 1.0, 2.0]
+    ROT_THRS = [0.5, 1.0, 2.0, 5.0, 10.0]
+    for thr in POS_THRS:
+        m[f"SR@{thr}m"] = sum(1 for p in all_pos if p < thr) / len(all_pos)
+    for thr in ROT_THRS:
+        m[f"RotAcc@{thr}deg"] = sum(1 for r in all_rot if r < thr) / len(all_rot)
+
+    # ---- Trajectory-level (ALL wps under threshold) ----
+    TRAJ_POS = [0.3, 0.5, 1.0, 2.0]
+    TRAJ_ROT = [1.0, 2.0, 5.0, 10.0]
+    for thr in TRAJ_POS:
+        m[f"TrajSR@{thr}m"] = sum(1 for poss, _ in pos_rot if all(p < thr for p in poss)) / n
+    for thr in TRAJ_ROT:
+        m[f"TrajRotSR@{thr}deg"] = sum(1 for _, rots in pos_rot if all(r < thr for r in rots)) / n
+
+    # ---- TRUE Joint constraint rates (any wp satisfies BOTH pos AND rot) ----
+    JOINT = [(0.5, 1.0), (0.5, 5.0), (0.5, 2.0),
+             (0.3, 1.0), (1.0, 1.0), (1.0, 5.0)]
+    for pt, rt in JOINT:
+        hit = 0
+        for poss, rots in pos_rot:
+            if any(p < pt and r < rt for p, r in zip(poss, rots)):
+                hit += 1
+        m[f"JointSR@({pt}m,{rt}deg)"] = hit / n
+
+    # ---- Trajectory-level TRUE Joint (ALL wps satisfy BOTH) ----
+    for pt, rt in JOINT:
+        hit = 0
+        for poss, rots in pos_rot:
+            if all(p < pt and r < rt for p, r in zip(poss, rots)):
+                hit += 1
+        m[f"TrajJointSR@({pt}m,{rt}deg)"] = hit / n
+
+    # ---- Percentile / tail metrics ----
+    fde_arr = np.array(fde); ade_arr = np.array(ade)
+    rot_arr = np.array(all_rot); pos_arr = np.array(all_pos)
+    for p in [50, 75, 90, 95, 99]:
+        m[f"FDE_p{p}"] = float(np.percentile(fde_arr, p))
+        m[f"ADE_p{p}"] = float(np.percentile(ade_arr, p))
+        m[f"rot_err_p{p}"] = float(np.percentile(rot_arr, p))
+        m[f"pos_err_p{p}"] = float(np.percentile(pos_arr, p))
+    m["FDE_max"] = float(fde_arr.max())
+    m["ADE_max"] = float(ade_arr.max())
+    m["rot_err_max"] = float(rot_arr.max())
+
+    # ---- Hard failure rates ----
+    for thr in [1.0, 2.0, 5.0, 10.0]:
+        m[f"HardFailRate_FDE_gt_{thr}m"] = sum(1 for f in fde if f > thr) / n
+    for thr in [10.0, 30.0, 60.0]:
+        per_sample_max_rot = [max(rots) if rots else 0 for _, rots in pos_rot]
+        m[f"HardFailRate_max_rot_gt_{thr}deg"] = sum(1 for r in per_sample_max_rot if r > thr) / n
+
+    # ---- Standard summary ----
+    m["FDE_mean"] = float(fde_arr.mean())
+    m["ADE_mean"] = float(ade_arr.mean())
+    m["FDE_rot_mean"] = float(np.array(fde_rot).mean())
+    m["pos_mae"] = float(pos_arr.mean())
+    m["rot_mae"] = float(rot_arr.mean())
+    m["pos_rmse"] = float(np.sqrt((pos_arr ** 2).mean()))
+    m["rot_rmse"] = float(np.sqrt((rot_arr ** 2).mean()))
+    m["n_samples"] = n
+    return m
+
+
+def fmt_pct(v): return f"{v*100:6.2f}%"
+def fmt_num(v, d=4): return f"{v:7.{d}f}"
+
+
+def main():
+    print("Loading raw error data ...")
+    A = load_raw(OUT_A)
+    B = load_raw(OUT_B)
+    maps = sorted(set(A.keys()) & set(B.keys()))
+    if not maps:
+        print("[ERROR] no maps with raw_errors_*.json found.")
+        print("Did you run eval_exp4_strict_parallel.sh first?")
+        return
+
+    print(f"Maps with raw data: {maps}\n")
+
+    # Compute exact metrics per map
+    metrics_A = {m: aggregate_metrics(A[m]) for m in maps}
+    metrics_B = {m: aggregate_metrics(B[m]) for m in maps}
+
+    # MEAN across maps (exclude all)
+    eval_maps = [m for m in maps if m != "all"]
+    mean_A = OrderedDict()
+    mean_B = OrderedDict()
+    for k in metrics_A[eval_maps[0]].keys():
+        if k == "n_samples":
+            continue
+        mean_A[k] = sum(metrics_A[m][k] for m in eval_maps) / len(eval_maps)
+        mean_B[k] = sum(metrics_B[m][k] for m in eval_maps) / len(eval_maps)
+
+    # ========================================================================
+    # SECTION 1: Layered metrics (loose -> extreme strict)
+    # ========================================================================
+    print("=" * 100)
+    print(" SECTION 1 — Layered precision (sample-level rates, EXACT)")
+    print("=" * 100)
+    LAYERED = OrderedDict([
+        ("L1 LOOSE (saturated)", [
+            ("SR@1.0m", "higher", "%"),
+            ("SR@2.0m", "higher", "%"),
+            ("RotAcc@10.0deg", "higher", "%"),
+        ]),
+        ("L2 STANDARD", [
+            ("SR@0.5m", "higher", "%"),
+            ("RotAcc@5.0deg", "higher", "%"),
+            ("TrajSR@1.0m", "higher", "%"),
+        ]),
+        ("L3 STRICT", [
+            ("SR@0.3m", "higher", "%"),
+            ("RotAcc@2.0deg", "higher", "%"),
+            ("RotAcc@1.0deg", "higher", "%"),
+            ("TrajSR@0.5m", "higher", "%"),
+            ("TrajRotSR@5.0deg", "higher", "%"),
+        ]),
+        ("L4 EXTREME", [
+            ("SR@0.2m", "higher", "%"),
+            ("SR@0.1m", "higher", "%"),
+            ("RotAcc@0.5deg", "higher", "%"),
+            ("TrajSR@0.3m", "higher", "%"),
+            ("TrajRotSR@1.0deg", "higher", "%"),
+        ]),
+    ])
+
+    for layer, entries in LAYERED.items():
+        print(f"\n>>> {layer}")
+        print(f"  {'Metric':25s}{'A mean':>12s}{'B mean':>12s}{'B - A':>12s}{'Win%':>10s}")
+        print("  " + "-" * 75)
+        for key, direction, _ in entries:
+            a, b = mean_A.get(key), mean_B.get(key)
+            if a is None or b is None:
+                continue
+            # win rate across maps
+            wins = sum(1 for m in eval_maps
+                       if (metrics_B[m][key] > metrics_A[m][key] if direction == "higher"
+                           else metrics_B[m][key] < metrics_A[m][key]))
+            ties = sum(1 for m in eval_maps if metrics_A[m][key] == metrics_B[m][key])
+            diff = (b - a) * 100
+            print(f"  {key:25s}{fmt_pct(a):>12s}{fmt_pct(b):>12s}"
+                  f"{diff:+11.2f}pp{wins:>3d}/{len(eval_maps)}+{ties}t")
+
+    # ========================================================================
+    # SECTION 2: TRUE JOINT constraints (exact)
+    # ========================================================================
+    print("\n" + "=" * 100)
+    print(" SECTION 2 — TRUE JOINT constraints (sample-level AND, exact)")
+    print("=" * 100)
+    print("  This is the GOLD STANDARD: each sample must satisfy BOTH pos+rot.")
+    print()
+    print(f"  {'Metric':40s}{'A mean':>12s}{'B mean':>12s}{'B - A':>12s}{'Win%':>10s}")
+    print("  " + "-" * 90)
+    JOINT_ROWS = [
+        ("JointSR@(0.5m,1.0deg)",     "any wp pos<0.5 AND rot<1°"),
+        ("JointSR@(0.5m,5.0deg)",     "any wp pos<0.5 AND rot<5°"),
+        ("JointSR@(0.3m,1.0deg)",     "any wp pos<0.3 AND rot<1°"),
+        ("JointSR@(1.0m,1.0deg)",     "any wp pos<1.0 AND rot<1°"),
+        ("TrajJointSR@(0.5m,1.0deg)", "ALL wps pos<0.5 AND rot<1°"),
+        ("TrajJointSR@(0.5m,5.0deg)", "ALL wps pos<0.5 AND rot<5°"),
+        ("TrajJointSR@(1.0m,5.0deg)", "ALL wps pos<1.0 AND rot<5°"),
+    ]
+    for key, _desc in JOINT_ROWS:
+        a, b = mean_A.get(key), mean_B.get(key)
+        if a is None or b is None:
+            continue
+        wins = sum(1 for m in eval_maps if metrics_B[m][key] > metrics_A[m][key])
+        ties = sum(1 for m in eval_maps if metrics_A[m][key] == metrics_B[m][key])
+        diff = (b - a) * 100
+        print(f"  {key:40s}{fmt_pct(a):>12s}{fmt_pct(b):>12s}"
+              f"{diff:+11.2f}pp{wins:>3d}/{len(eval_maps)}+{ties}t")
+
+    # ========================================================================
+    # SECTION 3: Percentile distributions (tail risk)
+    # ========================================================================
+    print("\n" + "=" * 100)
+    print(" SECTION 3 — Percentile distributions (tail risk, exact)")
+    print("=" * 100)
+    print("  Lower is better for all (these are error percentiles).")
+    print()
+    print(f"  {'Metric':25s}{'A':>10s}{'B':>10s}{'B improves':>14s}{'Win%':>10s}")
+    print("  " + "-" * 75)
+    PCT_ROWS = ["FDE_p50", "FDE_p75", "FDE_p90", "FDE_p95", "FDE_p99",
+                "ADE_p50", "ADE_p75", "ADE_p90", "ADE_p95", "ADE_p99",
+                "rot_err_p50", "rot_err_p75", "rot_err_p90", "rot_err_p95", "rot_err_p99",
+                "pos_err_p50", "pos_err_p75", "pos_err_p90", "pos_err_p95", "pos_err_p99",
+                "FDE_max", "ADE_max", "rot_err_max"]
+    for key in PCT_ROWS:
+        a, b = mean_A.get(key), mean_B.get(key)
+        if a is None or b is None:
+            continue
+        wins = sum(1 for m in eval_maps if metrics_B[m][key] < metrics_A[m][key])
+        rel = (a - b) / max(abs(a), 1e-9) * 100
+        print(f"  {key:25s}{fmt_num(a):>10s}{fmt_num(b):>10s}"
+              f"{rel:+12.2f}%  {wins:>3d}/{len(eval_maps)}")
+
+    # ========================================================================
+    # SECTION 4: Hard failure rates (catastrophic predictions)
+    # ========================================================================
+    print("\n" + "=" * 100)
+    print(" SECTION 4 — HARD failure rates (catastrophic predictions)")
+    print("=" * 100)
+    print("  Lower is better. These are samples where the model went seriously wrong.")
+    print()
+    print(f"  {'Metric':40s}{'A':>10s}{'B':>10s}{'B improves':>14s}{'Win%':>10s}")
+    print("  " + "-" * 90)
+    HARD_ROWS = ["HardFailRate_FDE_gt_1.0m", "HardFailRate_FDE_gt_2.0m",
+                 "HardFailRate_FDE_gt_5.0m", "HardFailRate_FDE_gt_10.0m",
+                 "HardFailRate_max_rot_gt_10.0deg",
+                 "HardFailRate_max_rot_gt_30.0deg",
+                 "HardFailRate_max_rot_gt_60.0deg"]
+    for key in HARD_ROWS:
+        a, b = mean_A.get(key), mean_B.get(key)
+        if a is None or b is None:
+            continue
+        wins = sum(1 for m in eval_maps if metrics_B[m][key] < metrics_A[m][key])
+        rel = (a - b) / max(abs(a), 1e-9) * 100 if a > 0 else 0
+        print(f"  {key:40s}{fmt_pct(a):>10s}{fmt_pct(b):>10s}"
+              f"{rel:+12.2f}%  {wins:>3d}/{len(eval_maps)}")
+
+    # ========================================================================
+    # SECTION 5: OOD analysis (Town10HD vs seen maps)
+    # ========================================================================
+    print("\n" + "=" * 100)
+    print(" SECTION 5 — OOD generalization (Town10HD = TRUE hold-out)")
+    print("=" * 100)
+    seen_maps = sorted(set(eval_maps) & SEEN_BY_B)
+    unseen_maps = sorted(set(eval_maps) & UNSEEN_BY_B)
+    if not unseen_maps:
+        print("  No OOD maps in eval set, skipping.")
+    else:
+        print(f"  Seen by B (near-domain): {seen_maps}")
+        print(f"  TRUE OOD: {unseen_maps}")
+        print()
+        OOD_KEYS = ["JointSR@(0.5m,1.0deg)", "TrajJointSR@(0.5m,5.0deg)",
+                    "RotAcc@1.0deg", "FDE_p95", "HardFailRate_FDE_gt_2.0m"]
+        for k in OOD_KEYS:
+            a_seen = sum(metrics_A[m][k] for m in seen_maps) / len(seen_maps)
+            b_seen = sum(metrics_B[m][k] for m in seen_maps) / len(seen_maps)
+            a_uns = sum(metrics_A[m][k] for m in unseen_maps) / len(unseen_maps)
+            b_uns = sum(metrics_B[m][k] for m in unseen_maps) / len(unseen_maps)
+            is_pct = "SR" in k or "Acc" in k or "Rate" in k
+            f = fmt_pct if is_pct else fmt_num
+            print(f"  {k:40s}")
+            print(f"     A seen: {f(a_seen)}  B seen: {f(b_seen)}  "
+                  f"A unseen: {f(a_uns)}  B unseen: {f(b_uns)}")
+            if is_pct:
+                gap_seen = (b_seen - a_seen) * 100
+                gap_uns = (b_uns - a_uns) * 100
+                print(f"     B-A on seen: {gap_seen:+.2f}pp,  "
+                      f"B-A on OOD: {gap_uns:+.2f}pp,  "
+                      f"OOD-loss-A: {(a_seen-a_uns)*100:.2f}pp,  "
+                      f"OOD-loss-B: {(b_seen-b_uns)*100:.2f}pp")
+
+    # ========================================================================
+    # SECTION 6: Composite verdict
+    # ========================================================================
+    print("\n" + "=" * 100)
+    print(" SECTION 6 — Verdict")
+    print("=" * 100)
+    # Win rate over a curated set
+    KEY_VERDICT_METRICS = [
+        ("SR@0.5m", "higher"),
+        ("RotAcc@1.0deg", "higher"),
+        ("JointSR@(0.5m,1.0deg)", "higher"),
+        ("TrajJointSR@(0.5m,5.0deg)", "higher"),
+        ("FDE_p95", "lower"),
+        ("HardFailRate_FDE_gt_2.0m", "lower"),
+        ("rot_err_p95", "lower"),
+    ]
+    print(f"  {'Verdict metric':40s}{'A':>11s}{'B':>11s}{'B advantage':>15s}")
+    print("  " + "-" * 80)
+    a_wins = 0; b_wins = 0
+    for key, direction in KEY_VERDICT_METRICS:
+        a, b = mean_A.get(key), mean_B.get(key)
+        if a is None or b is None:
+            continue
+        is_pct = "SR" in key or "Acc" in key or "Rate" in key
+        f = fmt_pct if is_pct else fmt_num
+        if direction == "higher":
+            adv = f"{(b-a)*100:+.2f}pp"
+            if b > a: b_wins += 1
+            else: a_wins += 1
+        else:
+            adv = f"{(a-b)/max(abs(a),1e-9)*100:+.2f}%"
+            if b < a: b_wins += 1
+            else: a_wins += 1
+        print(f"  {key:40s}{f(a):>11s}{f(b):>11s}{adv:>15s}")
+    print()
+    print(f"  Overall: B wins {b_wins}/{a_wins+b_wins} verdict metrics.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/patch_gemma_checkpoint.py b/scripts/patch_gemma_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e57b833e950b2df481ea70a713c9be7acd69956
--- /dev/null
+++ b/scripts/patch_gemma_checkpoint.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Patch a Gemma-4 SFT checkpoint that is missing some weights.
+
+DeepSpeed ZeRO3 sometimes drops sliding-window-layer K/V weights when saving.
+This script copies the missing weights from the base model into the SFT
+checkpoint, producing one complete model.safetensors (when patching in place,
+the stale model.safetensors.index.json is deleted) so vLLM can load the model.
+
+Usage:
+  python3 patch_gemma_checkpoint.py \
+      --base /path/to/Gemma-4-E4B-it \
+      --sft  /path/to/sft/output_dir \
+      [--out /path/to/output_dir]   # default: in-place
+"""
+import argparse
+import json
+import os
+import shutil
+from pathlib import Path
+
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--base", required=True, help="path to base model dir")
+    ap.add_argument("--sft", required=True, help="path to sft model dir")
+    ap.add_argument("--out", default=None, help="output dir (default: --sft)")
+    args = ap.parse_args()
+
+    base_dir = Path(args.base)
+    sft_dir = Path(args.sft)
+    out_dir = Path(args.out) if args.out else sft_dir
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    base_st = base_dir / "model.safetensors"
+    sft_st = sft_dir / "model.safetensors"
+    if not base_st.exists():
+        # multi-shard base model
+        base_st_index = base_dir / "model.safetensors.index.json"
+        assert base_st_index.exists(), f"missing {base_st} or its index"
+    if not sft_st.exists():
+        sft_st_index = sft_dir / "model.safetensors.index.json"
+        assert sft_st_index.exists(), f"missing {sft_st} or its index"
+
+    # load all sft tensors
+    print(f"[1/4] loading SFT weights from {sft_dir}...")
+    sft_tensors = {}
+    sft_index_file = sft_dir / "model.safetensors.index.json"
+    if sft_st.exists():
+        with safe_open(sft_st, framework="pt") as f:
+            for k in f.keys():
+                sft_tensors[k] = f.get_tensor(k)
+    else:
+        idx = json.loads(sft_index_file.read_text())
+        for shard in set(idx["weight_map"].values()):
+            with safe_open(sft_dir / shard, framework="pt") as f:
+                for k in f.keys():
+                    sft_tensors[k] = f.get_tensor(k)
+
+    # load all base tensor keys
+    print(f"[2/4] scanning base weights from {base_dir}...")
+    base_keys_to_files = {}
+    if base_st.exists():
+        with safe_open(base_st, framework="pt") as f:
+            for k in f.keys():
+                base_keys_to_files[k] = base_st
+    else:
+        idx = json.loads((base_dir / "model.safetensors.index.json").read_text())
+        for k, shard in idx["weight_map"].items():
+            base_keys_to_files[k] = base_dir / shard
+
+    base_keys = set(base_keys_to_files.keys())
+    sft_keys = set(sft_tensors.keys())
+    missing = sorted(base_keys - sft_keys)
+    extra = sorted(sft_keys - base_keys)
+
+    print(f"  base keys: {len(base_keys)}")
+    print(f"  sft  keys: {len(sft_keys)}")
+    print(f"  missing in sft: {len(missing)}  (will copy from base)")
+    print(f"  extra in sft  : {len(extra)}    (kept as-is)")
+
+    if not missing:
+        print("[OK] nothing to patch; sft is already complete")
+        return
+
+    # group missing keys by source shard, copy in batch
+    print(f"[3/4] copying {len(missing)} missing weights from base...")
+    by_shard = {}
+    for k in missing:
+        by_shard.setdefault(base_keys_to_files[k], []).append(k)
+
+    for shard_path, keys in by_shard.items():
+        with safe_open(shard_path, framework="pt") as f:
+            for k in keys:
+                t = f.get_tensor(k)
+                if t.dtype != torch.bfloat16:
+                    t = t.to(torch.bfloat16)
+                sft_tensors[k] = t
+
+    # write back as a single safetensors file
+    out_path = out_dir / "model.safetensors"
+    print(f"[4/4] writing patched checkpoint -> {out_path}")
+    # remove any stale single-file or index file in out_dir to avoid mismatch
+    if out_dir == sft_dir:
+        for stale in [out_dir / "model.safetensors.index.json"]:
+            if stale.exists():
+                print(f"  removing stale {stale}")
+                stale.unlink()
+    save_file(sft_tensors, str(out_path), metadata={"format": "pt"})
+    print(f"[OK] saved {len(sft_tensors)} tensors to {out_path}")
+    print(f"     size: {out_path.stat().st_size / 1e9:.2f} GB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/patch_qwen35_visual_keys.py b/scripts/patch_qwen35_visual_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..4415046b686650796224bed30cfe797d1d2dcb66
--- /dev/null
+++ b/scripts/patch_qwen35_visual_keys.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""Rename mis-located visual encoder keys in Qwen3.5 SFT checkpoints.
+
+Symptom: After full-parameter SFT with `freeze_vision_tower: true`, the
+saved checkpoint nests visual weights under `model.language_model.visual.*`
+instead of the expected top-level `visual.*` (which vLLM looks for).
+
+This script renames every key matching `model.language_model.visual.*`
+to `visual.*` and writes a new model.safetensors in place.
+
+Usage:
+  python3 patch_qwen35_visual_keys.py --sft /path/to/sft/output_dir
+"""
+import argparse
+import json
+from pathlib import Path
+
+from safetensors import safe_open
+from safetensors.torch import save_file
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--sft", required=True, help="path to sft model dir (contains model.safetensors)")
+    args = ap.parse_args()
+
+    sft_dir = Path(args.sft)
+    sft_st = sft_dir / "model.safetensors"
+    sft_index = sft_dir / "model.safetensors.index.json"
+
+    print(f"[1/3] loading SFT weights from {sft_dir}...")
+    sft_tensors = {}
+    if sft_st.exists():
+        with safe_open(sft_st, framework="pt") as f:
+            for k in f.keys():
+                sft_tensors[k] = f.get_tensor(k)
+    elif sft_index.exists():
+        idx = json.loads(sft_index.read_text())
+        for shard in set(idx["weight_map"].values()):
+            with safe_open(sft_dir / shard, framework="pt") as f:
+                for k in f.keys():
+                    sft_tensors[k] = f.get_tensor(k)
+    else:
+        raise FileNotFoundError(f"no model.safetensors or index in {sft_dir}")
+
+    print(f"  loaded {len(sft_tensors)} tensors")
+
+    print("[2/3] renaming model.language_model.visual.* -> visual.* ...")
+    PREFIX = "model.language_model.visual."
+    new_tensors = {}
+    renamed = 0
+    for k, v in sft_tensors.items():
+        if k.startswith(PREFIX):
+            new_key = "visual." + k[len(PREFIX):]
+            new_tensors[new_key] = v
+            renamed += 1
+        else:
+            new_tensors[k] = v
+
+    print(f"  renamed {renamed} keys")
+    if renamed == 0:
+        print("[OK] nothing to rename; skipping write")
+        return
+
+    out_path = sft_dir / "model.safetensors"
+    print(f"[3/3] writing patched checkpoint -> {out_path}")
+    if sft_index.exists():
+        print(f"  removing stale index file {sft_index}")
+        sft_index.unlink()
+    save_file(new_tensors, str(out_path), metadata={"format": "pt"})
+    sz_gb = out_path.stat().st_size / 1e9
+    print(f"[OK] wrote {len(new_tensors)} tensors, size {sz_gb:.2f} GB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/watchdog.ps1 b/watchdog.ps1
new file mode 100644
index 0000000000000000000000000000000000000000..cbb6e13b52ec7795f2888b73bb94c7265beeb143
--- /dev/null
+++ b/watchdog.ps1
@@ -0,0 +1,71 @@
+# Upload watchdog: kill+restart upload if its disk Read hasn't grown for 5 min.
+# Run this in its own detached PowerShell.
+
+$LOG_FILE = "D:\hf_upload\.watchdog.log"
+$STALL_SECONDS = 300  # 5 min no IO -> kill
+
+function Log-Msg($msg) {
+    # Prefix $msg with a timestamp, echo it to the host and append it to $LOG_FILE.
+    $entry = '[' + (Get-Date -Format 'yyyy-MM-dd HH:mm:ss') + "] $msg"
+    Write-Host $entry; Add-Content -Path $LOG_FILE -Value $entry
+}
+
+function Get-UploadProc {
+    # First python.exe whose command line contains "upload_to_hf"; nothing when none matches.
+    $candidates = Get-CimInstance Win32_Process -Filter "Name='python.exe'"
+    return $candidates | Where-Object { $_.CommandLine -like "*upload_to_hf*" } | Select-Object -First 1
+}
+
+function Restart-Upload {  # start the detached upload launcher via WMI and log the outcome
+    Log-Msg "Restarting upload (LFS dedup will skip already uploaded chunks)..."
+    $createArgs = @{ CommandLine = 'cmd.exe /c "D:\hf_upload\start_upload_detached.cmd"' }
+    $result = Invoke-CimMethod -ClassName Win32_Process -MethodName Create -Arguments $createArgs
+    Log-Msg ("Restart issued, WMI ReturnValue={0}, launcher PID={1}" -f $result.ReturnValue, $result.ProcessId)
+    Start-Sleep -Seconds 20  # fixed pause before the caller polls for the upload process again
+}
+
+Log-Msg "Watchdog started, stall threshold = $STALL_SECONDS s"
+
+# Baseline of the tracked process; $lastPid lets a replaced upload process get a fresh baseline.
+$lastRead = $null; $lastPid = $null
+$lastReadTime = Get-Date
+
+while ($true) {
+    $proc = Get-UploadProc
+    if (-not $proc) {
+        Log-Msg "No upload python found. Restarting..."
+        Restart-Upload
+        $lastRead = $null; $lastReadTime = Get-Date
+        continue
+    }
+
+    $curRead = $proc.ReadTransferCount
+    $curPid = $proc.ProcessId
+
+    if ($null -eq $lastRead -or $curPid -ne $lastPid) {
+        # First sight or a different process: its counter starts over, so never compare it to the old one.
+        $lastRead = $curRead; $lastPid = $curPid
+        $lastReadTime = Get-Date
+        Log-Msg "Tracking PID $curPid, init Read=$([math]::Round($curRead/1GB,2)) GB"
+    } elseif ($curRead -gt $lastRead) {
+        $lastRead = $curRead
+        $lastReadTime = Get-Date
+    } else {
+        $stallSec = ((Get-Date) - $lastReadTime).TotalSeconds
+        if ($stallSec -ge $STALL_SECONDS) {
+            Log-Msg "STALL DETECTED on PID ${curPid}: no Read for $([math]::Round($stallSec,0))s, killing..."
+            Stop-Process -Id $curPid -Force -ErrorAction SilentlyContinue
+            Get-CimInstance Win32_Process -Filter "Name='powershell.exe'" |
+                Where-Object { $_.CommandLine -like "*upload_to_hf*" } |
+                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }
+            Get-CimInstance Win32_Process -Filter "Name='cmd.exe'" |
+                Where-Object { $_.CommandLine -like "*start_upload_detached*" } |
+                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }
+            Start-Sleep -Seconds 5
+            Restart-Upload
+            $lastRead = $null; $lastReadTime = Get-Date
+        }
+    }
+
+    Start-Sleep -Seconds 30
+}