diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b6996a385f7c877a81241bd702b61456306c8806 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/InternVL3.5-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/Qwen3-VL-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/Qwen3-VL-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/Qwen3.5-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoints/Qwen3.5-9B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/.watchdog.log b/.watchdog.log new file mode 100644 index 0000000000000000000000000000000000000000..20444bfe0861bc495ec0c2794babe3fcee26a72e --- /dev/null +++ b/.watchdog.log @@ -0,0 +1,281 @@ +[2026-05-07 01:23:57] Watchdog started, stall threshold = 300 s +[2026-05-07 01:23:58] No upload python found. Restarting... +[2026-05-07 01:23:58] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 01:23:58] Restart issued, WMI ReturnValue=0, launcher PID=20160 +[2026-05-07 01:24:18] Tracking PID 22168, init Read=12.72 GB +[2026-05-07 03:23:40] No upload python found. Restarting... +[2026-05-07 03:23:40] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:23:40] Restart issued, WMI ReturnValue=0, launcher PID=27392 +[2026-05-07 03:24:01] Tracking PID 4136, init Read=9.47 GB +[2026-05-07 03:27:03] No upload python found. Restarting... +[2026-05-07 03:27:03] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:27:03] Restart issued, WMI ReturnValue=0, launcher PID=20248 +[2026-05-07 03:27:24] Tracking PID 13668, init Read=10.66 GB +[2026-05-07 03:30:26] No upload python found. Restarting... +[2026-05-07 03:30:26] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:30:26] Restart issued, WMI ReturnValue=0, launcher PID=14656 +[2026-05-07 03:30:46] Tracking PID 11616, init Read=9.98 GB +[2026-05-07 03:33:48] No upload python found. Restarting... +[2026-05-07 03:33:48] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:33:49] Restart issued, WMI ReturnValue=0, launcher PID=26872 +[2026-05-07 03:34:09] Tracking PID 1688, init Read=8.44 GB +[2026-05-07 03:37:11] No upload python found. Restarting... +[2026-05-07 03:37:11] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:37:11] Restart issued, WMI ReturnValue=0, launcher PID=25172 +[2026-05-07 03:37:31] Tracking PID 20240, init Read=10.62 GB +[2026-05-07 03:40:34] No upload python found. Restarting... +[2026-05-07 03:40:34] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:40:34] Restart issued, WMI ReturnValue=0, launcher PID=15440 +[2026-05-07 03:40:54] Tracking PID 7668, init Read=9.31 GB +[2026-05-07 03:43:56] No upload python found. Restarting... +[2026-05-07 03:43:56] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:43:56] Restart issued, WMI ReturnValue=0, launcher PID=12332 +[2026-05-07 03:44:17] Tracking PID 16364, init Read=9.01 GB +[2026-05-07 03:47:19] No upload python found. Restarting... +[2026-05-07 03:47:19] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:47:19] Restart issued, WMI ReturnValue=0, launcher PID=25116 +[2026-05-07 03:47:39] Tracking PID 21724, init Read=8.21 GB +[2026-05-07 03:50:41] No upload python found. Restarting... +[2026-05-07 03:50:41] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:50:42] Restart issued, WMI ReturnValue=0, launcher PID=29036 +[2026-05-07 03:51:02] Tracking PID 28372, init Read=8.72 GB +[2026-05-07 03:54:04] No upload python found. Restarting... +[2026-05-07 03:54:04] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:54:04] Restart issued, WMI ReturnValue=0, launcher PID=29684 +[2026-05-07 03:54:24] Tracking PID 20664, init Read=8.09 GB +[2026-05-07 03:57:26] No upload python found. Restarting... +[2026-05-07 03:57:26] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 03:57:26] Restart issued, WMI ReturnValue=0, launcher PID=25116 +[2026-05-07 03:57:47] Tracking PID 15052, init Read=8.79 GB +[2026-05-07 04:00:49] No upload python found. Restarting... +[2026-05-07 04:00:49] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:00:49] Restart issued, WMI ReturnValue=0, launcher PID=2668 +[2026-05-07 04:01:09] Tracking PID 8028, init Read=9.38 GB +[2026-05-07 04:04:11] No upload python found. Restarting... +[2026-05-07 04:04:11] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:04:11] Restart issued, WMI ReturnValue=0, launcher PID=4128 +[2026-05-07 04:04:31] Tracking PID 27280, init Read=9.46 GB +[2026-05-07 04:08:04] No upload python found. Restarting... +[2026-05-07 04:08:04] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:08:04] Restart issued, WMI ReturnValue=0, launcher PID=27408 +[2026-05-07 04:08:24] Tracking PID 29060, init Read=10.32 GB +[2026-05-07 04:11:27] No upload python found. Restarting... +[2026-05-07 04:11:27] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:11:27] Restart issued, WMI ReturnValue=0, launcher PID=18400 +[2026-05-07 04:11:47] Tracking PID 28568, init Read=9.09 GB +[2026-05-07 04:14:49] No upload python found. Restarting... +[2026-05-07 04:14:49] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:14:49] Restart issued, WMI ReturnValue=0, launcher PID=25660 +[2026-05-07 04:15:10] Tracking PID 7216, init Read=9.88 GB +[2026-05-07 04:18:12] No upload python found. Restarting... +[2026-05-07 04:18:12] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:18:12] Restart issued, WMI ReturnValue=0, launcher PID=27632 +[2026-05-07 04:18:32] Tracking PID 26584, init Read=8.21 GB +[2026-05-07 04:21:34] No upload python found. Restarting... +[2026-05-07 04:21:34] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:21:34] Restart issued, WMI ReturnValue=0, launcher PID=29684 +[2026-05-07 04:21:54] Tracking PID 1452, init Read=8.94 GB +[2026-05-07 04:24:57] No upload python found. Restarting... +[2026-05-07 04:24:57] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:24:57] Restart issued, WMI ReturnValue=0, launcher PID=23396 +[2026-05-07 04:25:17] Tracking PID 2080, init Read=9.61 GB +[2026-05-07 04:28:19] No upload python found. Restarting... +[2026-05-07 04:28:19] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:28:19] Restart issued, WMI ReturnValue=0, launcher PID=29288 +[2026-05-07 04:28:40] Tracking PID 12628, init Read=9.36 GB +[2026-05-07 04:31:42] No upload python found. Restarting... +[2026-05-07 04:31:42] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:31:42] Restart issued, WMI ReturnValue=0, launcher PID=29080 +[2026-05-07 04:32:02] Tracking PID 20776, init Read=8.87 GB +[2026-05-07 04:35:04] No upload python found. Restarting... +[2026-05-07 04:35:04] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:35:04] Restart issued, WMI ReturnValue=0, launcher PID=25012 +[2026-05-07 04:35:24] Tracking PID 23744, init Read=9.38 GB +[2026-05-07 04:38:27] No upload python found. Restarting... +[2026-05-07 04:38:27] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:38:27] Restart issued, WMI ReturnValue=0, launcher PID=8960 +[2026-05-07 04:38:47] Tracking PID 28516, init Read=8.45 GB +[2026-05-07 04:42:19] No upload python found. Restarting... +[2026-05-07 04:42:19] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:42:20] Restart issued, WMI ReturnValue=0, launcher PID=24896 +[2026-05-07 04:42:40] Tracking PID 20416, init Read=7.3 GB +[2026-05-07 04:45:42] No upload python found. Restarting... +[2026-05-07 04:45:42] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:45:42] Restart issued, WMI ReturnValue=0, launcher PID=16408 +[2026-05-07 04:46:02] Tracking PID 28992, init Read=9.8 GB +[2026-05-07 04:49:05] No upload python found. Restarting... +[2026-05-07 04:49:05] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:49:05] Restart issued, WMI ReturnValue=0, launcher PID=27912 +[2026-05-07 04:49:25] Tracking PID 960, init Read=9.27 GB +[2026-05-07 04:52:27] No upload python found. Restarting... +[2026-05-07 04:52:27] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:52:27] Restart issued, WMI ReturnValue=0, launcher PID=15432 +[2026-05-07 04:52:47] Tracking PID 24880, init Read=9.64 GB +[2026-05-07 04:56:20] No upload python found. Restarting... +[2026-05-07 04:56:20] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:56:20] Restart issued, WMI ReturnValue=0, launcher PID=3744 +[2026-05-07 04:56:41] Tracking PID 25356, init Read=8.27 GB +[2026-05-07 04:59:43] No upload python found. Restarting... +[2026-05-07 04:59:43] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 04:59:43] Restart issued, WMI ReturnValue=0, launcher PID=27888 +[2026-05-07 05:00:04] Tracking PID 27952, init Read=10.45 GB +[2026-05-07 05:03:06] No upload python found. Restarting... +[2026-05-07 05:03:06] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:03:06] Restart issued, WMI ReturnValue=0, launcher PID=20772 +[2026-05-07 05:03:26] Tracking PID 1456, init Read=10.27 GB +[2026-05-07 05:06:29] No upload python found. Restarting... +[2026-05-07 05:06:29] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:06:29] Restart issued, WMI ReturnValue=0, launcher PID=13848 +[2026-05-07 05:06:49] Tracking PID 28648, init Read=10.36 GB +[2026-05-07 05:09:51] No upload python found. Restarting... +[2026-05-07 05:09:51] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:09:52] Restart issued, WMI ReturnValue=0, launcher PID=6508 +[2026-05-07 05:10:12] Tracking PID 29120, init Read=9.57 GB +[2026-05-07 05:13:14] No upload python found. Restarting... +[2026-05-07 05:13:14] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:13:14] Restart issued, WMI ReturnValue=0, launcher PID=29080 +[2026-05-07 05:13:34] Tracking PID 29408, init Read=7.42 GB +[2026-05-07 05:17:07] No upload python found. Restarting... +[2026-05-07 05:17:07] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:17:07] Restart issued, WMI ReturnValue=0, launcher PID=5536 +[2026-05-07 05:17:27] Tracking PID 24176, init Read=9.01 GB +[2026-05-07 05:20:29] No upload python found. Restarting... +[2026-05-07 05:20:29] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:20:29] Restart issued, WMI ReturnValue=0, launcher PID=27784 +[2026-05-07 05:20:50] Tracking PID 27904, init Read=10.14 GB +[2026-05-07 05:23:52] No upload python found. Restarting... +[2026-05-07 05:23:52] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:23:52] Restart issued, WMI ReturnValue=0, launcher PID=3892 +[2026-05-07 05:24:12] Tracking PID 23124, init Read=8.03 GB +[2026-05-07 05:27:14] No upload python found. Restarting... +[2026-05-07 05:27:14] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:27:14] Restart issued, WMI ReturnValue=0, launcher PID=924 +[2026-05-07 05:27:35] Tracking PID 6124, init Read=8.05 GB +[2026-05-07 05:30:37] No upload python found. Restarting... +[2026-05-07 05:30:37] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:30:37] Restart issued, WMI ReturnValue=0, launcher PID=28232 +[2026-05-07 05:30:57] Tracking PID 1836, init Read=9.41 GB +[2026-05-07 05:33:59] No upload python found. Restarting... +[2026-05-07 05:33:59] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:33:59] Restart issued, WMI ReturnValue=0, launcher PID=29568 +[2026-05-07 05:34:20] Tracking PID 14728, init Read=8.76 GB +[2026-05-07 05:37:22] No upload python found. Restarting... +[2026-05-07 05:37:22] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:37:22] Restart issued, WMI ReturnValue=0, launcher PID=29036 +[2026-05-07 05:37:42] Tracking PID 21932, init Read=9.63 GB +[2026-05-07 05:40:44] No upload python found. Restarting... +[2026-05-07 05:40:44] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:40:44] Restart issued, WMI ReturnValue=0, launcher PID=5956 +[2026-05-07 05:41:05] Tracking PID 16784, init Read=10.13 GB +[2026-05-07 05:44:07] No upload python found. Restarting... +[2026-05-07 05:44:07] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:44:07] Restart issued, WMI ReturnValue=0, launcher PID=29208 +[2026-05-07 05:44:27] Tracking PID 26468, init Read=9.85 GB +[2026-05-07 05:47:29] No upload python found. Restarting... +[2026-05-07 05:47:29] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:47:30] Restart issued, WMI ReturnValue=0, launcher PID=17204 +[2026-05-07 05:47:50] Tracking PID 27924, init Read=9.21 GB +[2026-05-07 05:50:52] No upload python found. Restarting... +[2026-05-07 05:50:52] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:50:52] Restart issued, WMI ReturnValue=0, launcher PID=7704 +[2026-05-07 05:51:12] Tracking PID 25912, init Read=10.19 GB +[2026-05-07 05:54:14] No upload python found. Restarting... +[2026-05-07 05:54:14] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:54:14] Restart issued, WMI ReturnValue=0, launcher PID=28952 +[2026-05-07 05:54:35] Tracking PID 29272, init Read=8.83 GB +[2026-05-07 05:57:37] No upload python found. Restarting... +[2026-05-07 05:57:37] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 05:57:37] Restart issued, WMI ReturnValue=0, launcher PID=25524 +[2026-05-07 05:57:58] Tracking PID 8760, init Read=7.8 GB +[2026-05-07 06:01:00] No upload python found. Restarting... +[2026-05-07 06:01:00] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:01:00] Restart issued, WMI ReturnValue=0, launcher PID=29016 +[2026-05-07 06:01:20] Tracking PID 8040, init Read=9.98 GB +[2026-05-07 06:04:22] No upload python found. Restarting... +[2026-05-07 06:04:22] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:04:22] Restart issued, WMI ReturnValue=0, launcher PID=28840 +[2026-05-07 06:04:42] Tracking PID 25172, init Read=8.78 GB +[2026-05-07 06:07:44] No upload python found. Restarting... +[2026-05-07 06:07:44] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:07:44] Restart issued, WMI ReturnValue=0, launcher PID=14524 +[2026-05-07 06:08:05] Tracking PID 11872, init Read=8.56 GB +[2026-05-07 06:11:07] No upload python found. Restarting... +[2026-05-07 06:11:07] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:11:07] Restart issued, WMI ReturnValue=0, launcher PID=13756 +[2026-05-07 06:11:27] Tracking PID 15716, init Read=10.04 GB +[2026-05-07 06:15:00] No upload python found. Restarting... +[2026-05-07 06:15:00] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:15:00] Restart issued, WMI ReturnValue=0, launcher PID=28472 +[2026-05-07 06:15:20] Tracking PID 20180, init Read=7.92 GB +[2026-05-07 06:18:22] No upload python found. Restarting... +[2026-05-07 06:18:22] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:18:22] Restart issued, WMI ReturnValue=0, launcher PID=15712 +[2026-05-07 06:18:43] Tracking PID 12508, init Read=9.04 GB +[2026-05-07 06:21:45] No upload python found. Restarting... +[2026-05-07 06:21:45] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:21:45] Restart issued, WMI ReturnValue=0, launcher PID=22588 +[2026-05-07 06:22:06] Tracking PID 20564, init Read=8.24 GB +[2026-05-07 06:25:07] No upload python found. Restarting... +[2026-05-07 06:25:07] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:25:08] Restart issued, WMI ReturnValue=0, launcher PID=21216 +[2026-05-07 06:25:28] Tracking PID 27056, init Read=8.88 GB +[2026-05-07 06:28:30] No upload python found. Restarting... +[2026-05-07 06:28:30] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:28:30] Restart issued, WMI ReturnValue=0, launcher PID=15504 +[2026-05-07 06:28:50] Tracking PID 23240, init Read=8.62 GB +[2026-05-07 06:31:53] No upload python found. Restarting... +[2026-05-07 06:31:53] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:31:53] Restart issued, WMI ReturnValue=0, launcher PID=12632 +[2026-05-07 06:32:13] Tracking PID 29112, init Read=7.91 GB +[2026-05-07 06:35:46] No upload python found. Restarting... +[2026-05-07 06:35:46] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:35:46] Restart issued, WMI ReturnValue=0, launcher PID=6688 +[2026-05-07 06:36:06] Tracking PID 1880, init Read=9.38 GB +[2026-05-07 06:39:08] No upload python found. Restarting... +[2026-05-07 06:39:08] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:39:08] Restart issued, WMI ReturnValue=0, launcher PID=28860 +[2026-05-07 06:39:28] Tracking PID 20996, init Read=10.1 GB +[2026-05-07 06:42:31] No upload python found. Restarting... +[2026-05-07 06:42:31] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:42:31] Restart issued, WMI ReturnValue=0, launcher PID=12436 +[2026-05-07 06:42:51] Tracking PID 23428, init Read=8.12 GB +[2026-05-07 06:45:53] No upload python found. Restarting... +[2026-05-07 06:45:53] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:45:53] Restart issued, WMI ReturnValue=0, launcher PID=15440 +[2026-05-07 06:46:14] Tracking PID 26756, init Read=9.91 GB +[2026-05-07 06:49:16] No upload python found. Restarting... +[2026-05-07 06:49:16] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:49:16] Restart issued, WMI ReturnValue=0, launcher PID=28312 +[2026-05-07 06:49:36] Tracking PID 13260, init Read=8.84 GB +[2026-05-07 06:53:09] No upload python found. Restarting... +[2026-05-07 06:53:09] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:53:09] Restart issued, WMI ReturnValue=0, launcher PID=13476 +[2026-05-07 06:53:29] Tracking PID 18072, init Read=8.38 GB +[2026-05-07 06:56:32] No upload python found. Restarting... +[2026-05-07 06:56:32] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 06:56:32] Restart issued, WMI ReturnValue=0, launcher PID=17460 +[2026-05-07 06:56:52] Tracking PID 29056, init Read=8.07 GB +[2026-05-07 07:00:25] No upload python found. Restarting... +[2026-05-07 07:00:25] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:00:25] Restart issued, WMI ReturnValue=0, launcher PID=29528 +[2026-05-07 07:00:45] Tracking PID 21456, init Read=9.85 GB +[2026-05-07 07:03:47] No upload python found. Restarting... +[2026-05-07 07:03:47] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:03:47] Restart issued, WMI ReturnValue=0, launcher PID=24264 +[2026-05-07 07:04:08] Tracking PID 3440, init Read=9.81 GB +[2026-05-07 07:07:10] No upload python found. Restarting... +[2026-05-07 07:07:10] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:07:10] Restart issued, WMI ReturnValue=0, launcher PID=19964 +[2026-05-07 07:07:30] Tracking PID 29076, init Read=10.53 GB +[2026-05-07 07:11:03] No upload python found. Restarting... +[2026-05-07 07:11:03] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:11:03] Restart issued, WMI ReturnValue=0, launcher PID=5848 +[2026-05-07 07:11:23] Tracking PID 20508, init Read=9.59 GB +[2026-05-07 07:14:26] No upload python found. Restarting... +[2026-05-07 07:14:26] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:14:26] Restart issued, WMI ReturnValue=0, launcher PID=23236 +[2026-05-07 07:14:46] Tracking PID 2916, init Read=8.73 GB +[2026-05-07 07:17:48] No upload python found. Restarting... +[2026-05-07 07:17:48] Restarting upload (LFS dedup will skip already uploaded chunks)... +[2026-05-07 07:17:48] Restart issued, WMI ReturnValue=0, launcher PID=29588 +[2026-05-07 07:18:09] Tracking PID 26428, init Read=9.4 GB diff --git a/checkpoints/GLM-4.6V-Flash-SFT/all_results.json b/checkpoints/GLM-4.6V-Flash-SFT/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e3b419bb08f6b62c9d9805d663158c57f63ea8 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 2477163648385024.0, + "train_loss": 0.20598802658081056, + "train_runtime": 35266.4791, + "train_samples_per_second": 5.671, + "train_steps_per_second": 0.089 +} \ No newline at end of file diff --git a/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja b/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..efe6364aa1a49684fe075d7ff43003340e1db78a --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja @@ -0,0 +1,140 @@ +[gMASK] +{%- if tools -%} +<|system|> +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{% for tool in tools %} +{{ tool | tojson(ensure_ascii=False) }} +{% endfor %} + + +For each function call, output the function name and arguments within the following XML format: +{function-name} +{arg-key-1} +{arg-value-1} +{arg-key-2} +{arg-value-2} +... +{%- endif -%} +{%- macro visible_text(content) -%} + {%- if content is string -%} + {{- content }} + {%- elif content is iterable and content is not mapping -%} + {%- for item in content -%} + {%- if item is mapping and item.type == 'text' -%} + {{- item.text }} + {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%} + <|begin_of_image|><|image|><|end_of_image|> + {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%} + <|begin_of_video|><|video|><|end_of_video|> + {%- elif item is string -%} + {{- item }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{- content }} + {%- endif -%} +{%- endmacro -%} +{%- set ns = namespace(last_user_index=-1) %} +{%- for m in messages %} + {%- if m.role == 'user' %} + {% set ns.last_user_index = loop.index0 -%} + {%- endif %} +{%- endfor %} +{% for m in messages %} +{%- if m.role == 'user' -%}<|user|> +{% if m.content is string %} +{{ m.content }} +{%- else %} +{%- for item in m.content %} +{% if item.type == 'video' or 'video' in item %} +<|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %} +<|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %} +{{ item.text }} +{%- endif %} +{%- endfor %} +{%- endif %} +{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}} +{%- elif m.role == 'assistant' -%} +<|assistant|> +{%- set reasoning_content = '' %} +{%- set content = visible_text(m.content) %} +{%- if m.reasoning_content is string %} + {%- set reasoning_content = m.reasoning_content %} +{%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} +{%- endif %} +{%- if loop.index0 > ns.last_user_index and reasoning_content -%} +{{ '\n' + reasoning_content.strip() + ''}} +{%- else -%} +{{ '\n' }} +{%- endif -%} +{%- if content.strip() -%} +{{ '\n' + content.strip() }} +{%- endif -%} +{% if m.tool_calls %} +{% for tc in m.tool_calls %} +{%- if tc.function %} + {%- set tc = tc.function %} +{%- endif %} +{{ '\n' + tc.name }} +{% set _args = tc.arguments %} +{% for k, v in _args.items() %} +{{ k }} +{{ v | tojson(ensure_ascii=False) if v is not string else v }} +{% endfor %} +{% endfor %} +{% endif %} +{%- elif m.role == 'tool' -%} +{%- if m.content is string -%} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|observation|>' }} +{%- endif %} +{{- '\n\n' }} +{{- m.content }} +{{- '\n' }} +{% elif m.content is iterable and m.content is not mapping %} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} +{{- '<|observation|>' }} +{%- endif %} +{{- '\n\n' }} +{%- for tr in m.content -%} + {%- if tr is mapping and tr.type is defined -%} + {%- set t = tr.type | lower -%} + {%- if t == 'text' and tr.text is defined -%} +{{ tr.text }} + {%- elif t in ['image', 'image_url'] -%} +<|begin_of_image|><|image|><|end_of_image|> + {%- elif t in ['video', 'video_url'] -%} +<|begin_of_video|><|video|><|end_of_video|> + {%- else -%} +{{ tr | tojson(ensure_ascii=False) }} + {%- endif -%} + {%- else -%} +{{ tr.output if tr.output is defined else tr }} + {%- endif -%} +{%- endfor -%} +{{- '\n' }} +{%- else -%} +<|observation|>{% for tr in m.content %} + + +{{ tr.output if tr.output is defined else tr }} +{% endfor -%} +{% endif -%} +{%- elif m.role == 'system' -%} +<|system|> +{{ visible_text(m.content) }} +{%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} +<|assistant|> +{{'\n' if (enable_thinking is defined and not enable_thinking) else ''}} +{%- endif -%} \ No newline at end of file diff --git a/checkpoints/GLM-4.6V-Flash-SFT/config.json b/checkpoints/GLM-4.6V-Flash-SFT/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4b8c90732f3778a5bb41fde87b5cca52730074fe --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/config.json @@ -0,0 +1,72 @@ +{ + "architectures": [ + "Glm4vForConditionalGeneration" + ], + "dtype": "bfloat16", + "eos_token_id": 151329, + "hidden_size": 4096, + "image_end_token_id": 151340, + "image_start_token_id": 151339, + "image_token_id": 151363, + "model_type": "glm4v", + "pad_token_id": 151329, + "text_config": { + "attention_bias": true, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 151329, + 151336, + 151338 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 13696, + "max_position_embeddings": 131072, + "model_type": "glm4v_text", + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 2, + "pad_token_id": 151329, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "mrope_section": [ + 8, + 12, + 12 + ], + "partial_rotary_factor": 0.5, + "rope_theta": 500000, + "rope_type": "default" + }, + "use_cache": false, + "vocab_size": 151552 + }, + "tie_word_embeddings": false, + "transformers_version": "5.5.3", + "use_cache": false, + "video_end_token_id": 151342, + "video_start_token_id": 151341, + "video_token_id": 151364, + "vision_config": { + "attention_bias": false, + "attention_dropout": 0.0, + "depth": 24, + "dtype": "bfloat16", + "hidden_act": "silu", + "hidden_dropout_prob": 0.0, + "hidden_size": 1536, + "image_size": 336, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 13696, + "model_type": "glm4v_vision", + "num_heads": 12, + "out_hidden_size": 4096, + "patch_size": 14, + "rms_norm_eps": 1e-05, + "spatial_merge_size": 2, + "temporal_patch_size": 2 + } +} diff --git a/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json b/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json new file mode 100644 index 0000000000000000000000000000000000000000..d68a7434c3c70e2d63a3dab9c6bed2eef9716d64 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json @@ -0,0 +1,56 @@ +{ + "mae_dx": 0.1517896551724138, + "rmse_dx": 0.5050280292665226, + "mae_dy": 0.13570689655172413, + "rmse_dy": 0.40379185488190017, + "mae_dz": 0.017967241379310345, + "rmse_dz": 0.15680698656144998, + "mae_dpitch": 0.24627758620689652, + "rmse_dpitch": 0.5965444891927231, + "mae_dyaw": 1.0261448275862068, + "rmse_dyaw": 2.459724339755617, + "mae_droll": 0.0, + "rmse_droll": 0.0, + "mae_overall": 0.26298103448275856, + "mae_position": 0.10182126436781609, + "mae_rotation": 0.42414080459770115, + "rmse_overall": 1.068394337204253, + "wp1_euc_mae": 0.0698010264307822, + "wp1_euc_median": 0.01999999999999999, + "wp2_euc_mae": 0.1401695004658457, + "wp2_euc_median": 0.04123105625617661, + "wp3_euc_mae": 0.22301934350856006, + "wp3_euc_median": 0.07211102550927984, + "wp4_euc_mae": 0.32865394783587415, + "wp4_euc_median": 0.1104536101718727, + "wp5_euc_mae": 0.44338792793915116, + "wp5_euc_median": 0.15905694150420963, + "euclidean_mae": 0.24100634923604267, + "ADE": 0.24100634923604267, + "FDE": 0.44338792793915116, + "ADE_median": 0.08327688731593763, + "FDE_median": 0.15905694150420963, + "SR@0.5m": 0.8951724137931034, + "SR@1.0m": 0.9513793103448276, + "SR@2.0m": 0.9808620689655172, + "SR@5.0m": 0.9968965517241379, + "TrajSR@1.0m": 0.8974137931034483, + "TrajSR@2.0m": 0.9577586206896552, + "TrajSR@5.0m": 0.9922413793103448, + "RotAcc@1.0deg": 0.7027586206896552, + "RotAcc@5.0deg": 0.9586206896551724, + "RotAcc@10.0deg": 0.9889655172413793, + "wp1_rot_mae": 0.5029051706109685, + "wp2_rot_mae": 0.7513635215329055, + "wp3_rot_mae": 1.0546360645612183, + "wp4_rot_mae": 1.4243170022546052, + "wp5_rot_mae": 1.784744600833039, + "rotation_euc_mae": 1.1035932719585473, + "parse_failure_rate": 0.0, + "parse_success_rate": 1.0, + "valid_samples": 1160, + "total_samples": 1160, + "parse_failures": 0, + "inference_engine": "vllm", + "vllm_version": "0.19.0" +} \ No newline at end of file diff --git a/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json b/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1329f41b4b2856cdec0a6f99d5946550add7d47c --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/generation_config.json @@ -0,0 +1,16 @@ +{ + "_from_model_config": true, + "do_sample": true, + "eos_token_id": [ + 151329, + 151329, + 151336, + 151338, + 151348 + ], + "pad_token_id": 151329, + "temperature": 0.8, + "top_k": 2, + "top_p": 0.6, + "transformers_version": "5.5.3" +} diff --git a/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors b/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cc7b59ff3f6a066137f54581c56df169c99f1d50 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8a32229e6fe30d156e4259207d341d5b0022d08d8df59cd08760bf85cd5d215 +size 20585645128 diff --git a/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json b/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b14e663a0204b0d1d28fdc1e6515145147b5ce85 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/processor_config.json @@ -0,0 +1,63 @@ +{ + "image_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Glm46VImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "merge_size": 2, + "patch_size": 14, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "longest_edge": 9633792, + "shortest_edge": 12544 + }, + "temporal_patch_size": 2 + }, + "processor_class": "Glm46VProcessor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "fps": 2, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "max_duration": 300, + "max_image_size": { + "longest_edge": 47040000 + }, + "merge_size": 2, + "num_frames": 16, + "patch_size": 14, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "size": { + "longest_edge": 100352000, + "shortest_edge": 12544 + }, + "temporal_patch_size": 2, + "video_processor_type": "Glm46VVideoProcessor" + } +} diff --git a/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5045eeb65854e3e7732f8d69dda6529fd862a0bc --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eecde1f225a86abef606164ceeb446737e592c4e7a40afe5cbf3ce8328e3df99 +size 19970886 diff --git a/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70b612ef9461b1f4390d0773c15c0fa9dfabf11c --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "backend": "tokenizers", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": [ + "<|user|>", + "<|observation|>", + "" + ], + "is_local": true, + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "processor_class": "Glm46VProcessor", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "TokenizersBackend" +} diff --git a/checkpoints/GLM-4.6V-Flash-SFT/train_results.json b/checkpoints/GLM-4.6V-Flash-SFT/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e3b419bb08f6b62c9d9805d663158c57f63ea8 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 2477163648385024.0, + "train_loss": 0.20598802658081056, + "train_runtime": 35266.4791, + "train_samples_per_second": 5.671, + "train_steps_per_second": 0.089 +} \ No newline at end of file diff --git a/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json b/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..01464cb06b021fac72b88d87d0fd1df501adb789 --- /dev/null +++ b/checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json @@ -0,0 +1,2227 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 20.093808181688754, + "learning_rate": 1.437699680511182e-07, + "loss": 0.7523126602172852, + "step": 10 + }, + { + "epoch": 0.0064, + "grad_norm": 16.520568445399164, + "learning_rate": 3.0351437699680514e-07, + "loss": 0.684361743927002, + "step": 20 + }, + { + "epoch": 0.0096, + "grad_norm": 7.062991511064744, + "learning_rate": 4.6325878594249205e-07, + "loss": 0.46736898422241213, + "step": 30 + }, + { + "epoch": 0.0128, + "grad_norm": 1.0572338350229438, + "learning_rate": 6.230031948881789e-07, + "loss": 0.3222517013549805, + "step": 40 + }, + { + "epoch": 0.016, + "grad_norm": 0.768970780796944, + "learning_rate": 7.82747603833866e-07, + "loss": 0.29146518707275393, + "step": 50 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8158618748659492, + "learning_rate": 9.424920127795528e-07, + "loss": 0.28341834545135497, + "step": 60 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7218086220464439, + "learning_rate": 1.1022364217252397e-06, + "loss": 0.2903137683868408, + "step": 70 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7459109221323802, + "learning_rate": 1.2619808306709266e-06, + "loss": 0.2718811988830566, + "step": 80 + }, + { + "epoch": 0.0288, + "grad_norm": 0.7186860317140319, + "learning_rate": 1.4217252396166134e-06, + "loss": 0.2660067558288574, + "step": 90 + }, + { + "epoch": 0.032, + "grad_norm": 0.765918500231858, + "learning_rate": 1.5814696485623005e-06, + "loss": 0.26980152130126955, + "step": 100 + }, + { + "epoch": 0.0352, + "grad_norm": 0.7344200083929374, + "learning_rate": 1.7412140575079875e-06, + "loss": 0.2695180416107178, + "step": 110 + }, + { + "epoch": 0.0384, + "grad_norm": 0.7057487416602337, + "learning_rate": 1.9009584664536742e-06, + "loss": 0.2582674264907837, + "step": 120 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6996888798419932, + "learning_rate": 2.060702875399361e-06, + "loss": 0.2612154960632324, + "step": 130 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7150606291134206, + "learning_rate": 2.220447284345048e-06, + "loss": 0.2520437717437744, + "step": 140 + }, + { + "epoch": 0.048, + "grad_norm": 0.7697242977250355, + "learning_rate": 2.380191693290735e-06, + "loss": 0.2501786470413208, + "step": 150 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6327215717833664, + "learning_rate": 2.539936102236422e-06, + "loss": 0.24434318542480468, + "step": 160 + }, + { + "epoch": 0.0544, + "grad_norm": 0.7947096523807732, + "learning_rate": 2.699680511182109e-06, + "loss": 0.25281600952148436, + "step": 170 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6717890611061146, + "learning_rate": 2.8594249201277955e-06, + "loss": 0.2454531669616699, + "step": 180 + }, + { + "epoch": 0.0608, + "grad_norm": 0.7151585341922304, + "learning_rate": 3.0191693290734825e-06, + "loss": 0.2505363464355469, + "step": 190 + }, + { + "epoch": 0.064, + "grad_norm": 0.8601334705182279, + "learning_rate": 3.17891373801917e-06, + "loss": 0.2505714178085327, + "step": 200 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6106680426063227, + "learning_rate": 3.3386581469648564e-06, + "loss": 0.24775364398956298, + "step": 210 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6262984320818072, + "learning_rate": 3.4984025559105434e-06, + "loss": 0.24066565036773682, + "step": 220 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6078537303186395, + "learning_rate": 3.6581469648562303e-06, + "loss": 0.24378209114074706, + "step": 230 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5889510426869463, + "learning_rate": 3.817891373801918e-06, + "loss": 0.23820171356201172, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 0.5658292689427505, + "learning_rate": 3.977635782747604e-06, + "loss": 0.23654117584228515, + "step": 250 + }, + { + "epoch": 0.0832, + "grad_norm": 0.5757166706348428, + "learning_rate": 4.137380191693291e-06, + "loss": 0.23743386268615724, + "step": 260 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5807034355359694, + "learning_rate": 4.297124600638978e-06, + "loss": 0.23970918655395507, + "step": 270 + }, + { + "epoch": 0.0896, + "grad_norm": 0.5634022487351626, + "learning_rate": 4.456869009584665e-06, + "loss": 0.23490209579467775, + "step": 280 + }, + { + "epoch": 0.0928, + "grad_norm": 0.5520223075835592, + "learning_rate": 4.616613418530352e-06, + "loss": 0.2404552936553955, + "step": 290 + }, + { + "epoch": 0.096, + "grad_norm": 0.5587222430473198, + "learning_rate": 4.776357827476039e-06, + "loss": 0.24298410415649413, + "step": 300 + }, + { + "epoch": 0.0992, + "grad_norm": 0.542281258937415, + "learning_rate": 4.936102236421725e-06, + "loss": 0.22964231967926024, + "step": 310 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6339707011249724, + "learning_rate": 4.999943833158769e-06, + "loss": 0.22938170433044433, + "step": 320 + }, + { + "epoch": 0.1056, + "grad_norm": 0.5290859105179109, + "learning_rate": 4.999600600490783e-06, + "loss": 0.23717782497406006, + "step": 330 + }, + { + "epoch": 0.1088, + "grad_norm": 0.574404257271199, + "learning_rate": 4.9989453817439345e-06, + "loss": 0.23035426139831544, + "step": 340 + }, + { + "epoch": 0.112, + "grad_norm": 0.5887719210155044, + "learning_rate": 4.997978258698942e-06, + "loss": 0.230421781539917, + "step": 350 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5618660264892863, + "learning_rate": 4.996699352066659e-06, + "loss": 0.23192777633666992, + "step": 360 + }, + { + "epoch": 0.1184, + "grad_norm": 0.589113954603133, + "learning_rate": 4.995108821473014e-06, + "loss": 0.23194873332977295, + "step": 370 + }, + { + "epoch": 0.1216, + "grad_norm": 0.552581223712263, + "learning_rate": 4.993206865439084e-06, + "loss": 0.22629022598266602, + "step": 380 + }, + { + "epoch": 0.1248, + "grad_norm": 0.5506631212695152, + "learning_rate": 4.990993721356317e-06, + "loss": 0.22567858695983886, + "step": 390 + }, + { + "epoch": 0.128, + "grad_norm": 0.5210832665844604, + "learning_rate": 4.988469665456901e-06, + "loss": 0.22596418857574463, + "step": 400 + }, + { + "epoch": 0.1312, + "grad_norm": 0.5132503738005023, + "learning_rate": 4.985635012779288e-06, + "loss": 0.23435051441192628, + "step": 410 + }, + { + "epoch": 0.1344, + "grad_norm": 0.5264119522984109, + "learning_rate": 4.98249011712887e-06, + "loss": 0.2258882999420166, + "step": 420 + }, + { + "epoch": 0.1376, + "grad_norm": 0.5122311697688684, + "learning_rate": 4.979035371033824e-06, + "loss": 0.22527906894683838, + "step": 430 + }, + { + "epoch": 0.1408, + "grad_norm": 0.5105227090020142, + "learning_rate": 4.975271205696115e-06, + "loss": 0.2246992588043213, + "step": 440 + }, + { + "epoch": 0.144, + "grad_norm": 0.5307268054645026, + "learning_rate": 4.971198090937671e-06, + "loss": 0.2193459987640381, + "step": 450 + }, + { + "epoch": 0.1472, + "grad_norm": 0.46923570087876276, + "learning_rate": 4.966816535141756e-06, + "loss": 0.21553544998168944, + "step": 460 + }, + { + "epoch": 0.1504, + "grad_norm": 0.4881836025298746, + "learning_rate": 4.9621270851895035e-06, + "loss": 0.22505784034729004, + "step": 470 + }, + { + "epoch": 0.1536, + "grad_norm": 0.50506411723612, + "learning_rate": 4.957130326391662e-06, + "loss": 0.22673957347869872, + "step": 480 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5086993434891525, + "learning_rate": 4.951826882415544e-06, + "loss": 0.22294471263885499, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 0.5280465251135189, + "learning_rate": 4.946217415207177e-06, + "loss": 0.21789300441741943, + "step": 500 + }, + { + "epoch": 0.1632, + "grad_norm": 0.5337843871964275, + "learning_rate": 4.940302624908689e-06, + "loss": 0.22192811965942383, + "step": 510 + }, + { + "epoch": 0.1664, + "grad_norm": 0.4884343559217744, + "learning_rate": 4.934083249770912e-06, + "loss": 0.21614904403686525, + "step": 520 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5316592538281818, + "learning_rate": 4.927560066061251e-06, + "loss": 0.21973915100097657, + "step": 530 + }, + { + "epoch": 0.1728, + "grad_norm": 0.518761429695226, + "learning_rate": 4.920733887966783e-06, + "loss": 0.23207192420959472, + "step": 540 + }, + { + "epoch": 0.176, + "grad_norm": 0.511452747175852, + "learning_rate": 4.913605567492636e-06, + "loss": 0.21878607273101808, + "step": 550 + }, + { + "epoch": 0.1792, + "grad_norm": 0.49924599926539726, + "learning_rate": 4.906175994355656e-06, + "loss": 0.22075920104980468, + "step": 560 + }, + { + "epoch": 0.1824, + "grad_norm": 0.5259698850641532, + "learning_rate": 4.898446095873345e-06, + "loss": 0.22276382446289061, + "step": 570 + }, + { + "epoch": 0.1856, + "grad_norm": 0.501751014152873, + "learning_rate": 4.890416836848128e-06, + "loss": 0.21954989433288574, + "step": 580 + }, + { + "epoch": 0.1888, + "grad_norm": 0.5167201593356286, + "learning_rate": 4.882089219446925e-06, + "loss": 0.2145029067993164, + "step": 590 + }, + { + "epoch": 0.192, + "grad_norm": 0.5006060240232905, + "learning_rate": 4.873464283076074e-06, + "loss": 0.22003324031829835, + "step": 600 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4477538874438277, + "learning_rate": 4.864543104251587e-06, + "loss": 0.21916275024414061, + "step": 610 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4832933241270485, + "learning_rate": 4.855326796464798e-06, + "loss": 0.2203526973724365, + "step": 620 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5359361967005408, + "learning_rate": 4.8458165100433725e-06, + "loss": 0.21596732139587402, + "step": 630 + }, + { + "epoch": 0.2048, + "grad_norm": 0.5708003689943741, + "learning_rate": 4.836013432007738e-06, + "loss": 0.2171140193939209, + "step": 640 + }, + { + "epoch": 0.208, + "grad_norm": 0.4831169531465719, + "learning_rate": 4.825918785922921e-06, + "loss": 0.22040581703186035, + "step": 650 + }, + { + "epoch": 0.2112, + "grad_norm": 0.4982382400104379, + "learning_rate": 4.8155338317458315e-06, + "loss": 0.21841506958007811, + "step": 660 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4741071764041748, + "learning_rate": 4.804859865668002e-06, + "loss": 0.2143453598022461, + "step": 670 + }, + { + "epoch": 0.2176, + "grad_norm": 0.47853550451884025, + "learning_rate": 4.793898219953804e-06, + "loss": 0.21545085906982422, + "step": 680 + }, + { + "epoch": 0.2208, + "grad_norm": 0.4902247743421047, + "learning_rate": 4.782650262774164e-06, + "loss": 0.2166231393814087, + "step": 690 + }, + { + "epoch": 0.224, + "grad_norm": 0.4611717059287351, + "learning_rate": 4.7711173980357886e-06, + "loss": 0.21284222602844238, + "step": 700 + }, + { + "epoch": 0.2272, + "grad_norm": 0.4815654128340087, + "learning_rate": 4.759301065205947e-06, + "loss": 0.21358721256256102, + "step": 710 + }, + { + "epoch": 0.2304, + "grad_norm": 0.5049245613626656, + "learning_rate": 4.7472027391328e-06, + "loss": 0.21447527408599854, + "step": 720 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4758997167389971, + "learning_rate": 4.734823929861317e-06, + "loss": 0.21809780597686768, + "step": 730 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5423173365143716, + "learning_rate": 4.722166182444801e-06, + "loss": 0.21390962600708008, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 0.44572231492476455, + "learning_rate": 4.709231076752045e-06, + "loss": 0.21404554843902587, + "step": 750 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4848421373802031, + "learning_rate": 4.696020227270142e-06, + "loss": 0.21710457801818847, + "step": 760 + }, + { + "epoch": 0.2464, + "grad_norm": 0.518532765750562, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.21285481452941896, + "step": 770 + }, + { + "epoch": 0.2496, + "grad_norm": 0.5008678397970389, + "learning_rate": 4.668777926765392e-06, + "loss": 0.21155524253845215, + "step": 780 + }, + { + "epoch": 0.2528, + "grad_norm": 0.48720974823345864, + "learning_rate": 4.6547498759731725e-06, + "loss": 0.20655455589294433, + "step": 790 + }, + { + "epoch": 0.256, + "grad_norm": 0.49528977499161353, + "learning_rate": 4.6404528814286575e-06, + "loss": 0.2101435422897339, + "step": 800 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4532686250809506, + "learning_rate": 4.6258887276022425e-06, + "loss": 0.21684365272521972, + "step": 810 + }, + { + "epoch": 0.2624, + "grad_norm": 0.49803115837380546, + "learning_rate": 4.611059232309639e-06, + "loss": 0.21193151473999022, + "step": 820 + }, + { + "epoch": 0.2656, + "grad_norm": 0.5153783225404047, + "learning_rate": 4.595966246484986e-06, + "loss": 0.21344296932220458, + "step": 830 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4765272009238815, + "learning_rate": 4.580611653949829e-06, + "loss": 0.21319386959075928, + "step": 840 + }, + { + "epoch": 0.272, + "grad_norm": 0.5228745905777464, + "learning_rate": 4.564997371177992e-06, + "loss": 0.21112470626831054, + "step": 850 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4583805155148445, + "learning_rate": 4.54912534705637e-06, + "loss": 0.2108391284942627, + "step": 860 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4920259584441244, + "learning_rate": 4.532997562641683e-06, + "loss": 0.20768051147460936, + "step": 870 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5200095181799963, + "learning_rate": 4.516616030913214e-06, + "loss": 0.21211957931518555, + "step": 880 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4788503683270311, + "learning_rate": 4.499982796521556e-06, + "loss": 0.20693025588989258, + "step": 890 + }, + { + "epoch": 0.288, + "grad_norm": 0.4666456137071941, + "learning_rate": 4.48309993553341e-06, + "loss": 0.20890872478485106, + "step": 900 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4794527139448749, + "learning_rate": 4.465969555172468e-06, + "loss": 0.20777955055236816, + "step": 910 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4616610840587355, + "learning_rate": 4.448593793556391e-06, + "loss": 0.21416122913360597, + "step": 920 + }, + { + "epoch": 0.2976, + "grad_norm": 0.47725407011391663, + "learning_rate": 4.430974819429954e-06, + "loss": 0.20783448219299316, + "step": 930 + }, + { + "epoch": 0.3008, + "grad_norm": 0.4596350013424985, + "learning_rate": 4.413114831894344e-06, + "loss": 0.20199823379516602, + "step": 940 + }, + { + "epoch": 0.304, + "grad_norm": 0.4940149958405755, + "learning_rate": 4.3950160601326865e-06, + "loss": 0.20049993991851806, + "step": 950 + }, + { + "epoch": 0.3072, + "grad_norm": 0.4891958940488766, + "learning_rate": 4.376680763131811e-06, + "loss": 0.20765538215637208, + "step": 960 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5373640149223949, + "learning_rate": 4.358111229400296e-06, + "loss": 0.2103745460510254, + "step": 970 + }, + { + "epoch": 0.3136, + "grad_norm": 0.5035919946088194, + "learning_rate": 4.33930977668283e-06, + "loss": 0.2148181438446045, + "step": 980 + }, + { + "epoch": 0.3168, + "grad_norm": 0.498832420199319, + "learning_rate": 4.320278751670922e-06, + "loss": 0.20667800903320313, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 0.5016480811009209, + "learning_rate": 4.301020529710009e-06, + "loss": 0.20847175121307374, + "step": 1000 + }, + { + "epoch": 0.3232, + "grad_norm": 0.5355131410598809, + "learning_rate": 4.281537514502962e-06, + "loss": 0.21192097663879395, + "step": 1010 + }, + { + "epoch": 0.3264, + "grad_norm": 0.49710771531514497, + "learning_rate": 4.261832137810093e-06, + "loss": 0.20849306583404542, + "step": 1020 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4702938633516668, + "learning_rate": 4.241906859145611e-06, + "loss": 0.20947628021240233, + "step": 1030 + }, + { + "epoch": 0.3328, + "grad_norm": 0.47328762785100176, + "learning_rate": 4.221764165470661e-06, + "loss": 0.20568199157714845, + "step": 1040 + }, + { + "epoch": 0.336, + "grad_norm": 0.48090607151875236, + "learning_rate": 4.201406570882898e-06, + "loss": 0.20522446632385255, + "step": 1050 + }, + { + "epoch": 0.3392, + "grad_norm": 0.46870182419574746, + "learning_rate": 4.180836616302704e-06, + "loss": 0.2044762134552002, + "step": 1060 + }, + { + "epoch": 0.3424, + "grad_norm": 0.49284234006242156, + "learning_rate": 4.160056869156041e-06, + "loss": 0.20835609436035157, + "step": 1070 + }, + { + "epoch": 0.3456, + "grad_norm": 0.425482663225026, + "learning_rate": 4.139069923053995e-06, + "loss": 0.20575876235961915, + "step": 1080 + }, + { + "epoch": 0.3488, + "grad_norm": 0.46647669293000804, + "learning_rate": 4.117878397469062e-06, + "loss": 0.20992250442504884, + "step": 1090 + }, + { + "epoch": 0.352, + "grad_norm": 0.4464343988416538, + "learning_rate": 4.096484937408195e-06, + "loss": 0.20092244148254396, + "step": 1100 + }, + { + "epoch": 0.3552, + "grad_norm": 0.5116088744854695, + "learning_rate": 4.074892213082676e-06, + "loss": 0.20036702156066893, + "step": 1110 + }, + { + "epoch": 0.3584, + "grad_norm": 4.940314739525779, + "learning_rate": 4.0531029195748265e-06, + "loss": 0.21338913440704346, + "step": 1120 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4721397920115156, + "learning_rate": 4.03111977650163e-06, + "loss": 0.20792775154113768, + "step": 1130 + }, + { + "epoch": 0.3648, + "grad_norm": 0.5105519348301445, + "learning_rate": 4.008945527675281e-06, + "loss": 0.2061443328857422, + "step": 1140 + }, + { + "epoch": 0.368, + "grad_norm": 0.523180958068929, + "learning_rate": 3.986582940760717e-06, + "loss": 0.1962942123413086, + "step": 1150 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5027335828799008, + "learning_rate": 3.9640348069301785e-06, + "loss": 0.2031947612762451, + "step": 1160 + }, + { + "epoch": 0.3744, + "grad_norm": 0.48735270934050073, + "learning_rate": 3.941303940514826e-06, + "loss": 0.20448057651519774, + "step": 1170 + }, + { + "epoch": 0.3776, + "grad_norm": 0.5075332440871839, + "learning_rate": 3.918393178653472e-06, + "loss": 0.20594587326049804, + "step": 1180 + }, + { + "epoch": 0.3808, + "grad_norm": 0.4485083644552742, + "learning_rate": 3.895305380938468e-06, + "loss": 0.20264167785644532, + "step": 1190 + }, + { + "epoch": 0.384, + "grad_norm": 0.4568492727427137, + "learning_rate": 3.872043429058783e-06, + "loss": 0.20010733604431152, + "step": 1200 + }, + { + "epoch": 0.3872, + "grad_norm": 0.46103501808297814, + "learning_rate": 3.84861022644033e-06, + "loss": 0.2026883602142334, + "step": 1210 + }, + { + "epoch": 0.3904, + "grad_norm": 0.46609834517793386, + "learning_rate": 3.825008697883574e-06, + "loss": 0.21079249382019044, + "step": 1220 + }, + { + "epoch": 0.3936, + "grad_norm": 0.49992288047242467, + "learning_rate": 3.8012417891984776e-06, + "loss": 0.2031094551086426, + "step": 1230 + }, + { + "epoch": 0.3968, + "grad_norm": 0.4746264528155682, + "learning_rate": 3.777312466836819e-06, + "loss": 0.20238199234008789, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 0.45243385346205817, + "learning_rate": 3.7532237175219378e-06, + "loss": 0.20085253715515136, + "step": 1250 + }, + { + "epoch": 0.4032, + "grad_norm": 0.48931316379420287, + "learning_rate": 3.728978547875948e-06, + "loss": 0.20520598888397218, + "step": 1260 + }, + { + "epoch": 0.4064, + "grad_norm": 0.5229456414008956, + "learning_rate": 3.7045799840444712e-06, + "loss": 0.19984333515167235, + "step": 1270 + }, + { + "epoch": 0.4096, + "grad_norm": 0.4773055647919508, + "learning_rate": 3.6800310713189258e-06, + "loss": 0.20064287185668944, + "step": 1280 + }, + { + "epoch": 0.4128, + "grad_norm": 0.4824962267097886, + "learning_rate": 3.6553348737564328e-06, + "loss": 0.20138092041015626, + "step": 1290 + }, + { + "epoch": 0.416, + "grad_norm": 0.47245858486532044, + "learning_rate": 3.6304944737973794e-06, + "loss": 0.20704314708709717, + "step": 1300 + }, + { + "epoch": 0.4192, + "grad_norm": 0.47670774891547607, + "learning_rate": 3.6055129718806836e-06, + "loss": 0.20015296936035157, + "step": 1310 + }, + { + "epoch": 0.4224, + "grad_norm": 0.4553061754046557, + "learning_rate": 3.5803934860568134e-06, + "loss": 0.19692450761795044, + "step": 1320 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5124220374815842, + "learning_rate": 3.5551391515986163e-06, + "loss": 0.2016448497772217, + "step": 1330 + }, + { + "epoch": 0.4288, + "grad_norm": 0.4809826187082155, + "learning_rate": 3.529753120609982e-06, + "loss": 0.19793987274169922, + "step": 1340 + }, + { + "epoch": 0.432, + "grad_norm": 0.48798480914379067, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.20041651725769044, + "step": 1350 + }, + { + "epoch": 0.4352, + "grad_norm": 0.4589600174491072, + "learning_rate": 3.4785986592495934e-06, + "loss": 0.19874777793884277, + "step": 1360 + }, + { + "epoch": 0.4384, + "grad_norm": 0.44810416886840765, + "learning_rate": 3.452836613689803e-06, + "loss": 0.19696075916290284, + "step": 1370 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4584133576368786, + "learning_rate": 3.426955640426584e-06, + "loss": 0.20014967918395996, + "step": 1380 + }, + { + "epoch": 0.4448, + "grad_norm": 0.46474214573205574, + "learning_rate": 3.4009589697773605e-06, + "loss": 0.19937365055084227, + "step": 1390 + }, + { + "epoch": 0.448, + "grad_norm": 0.4671452045462699, + "learning_rate": 3.3748498465002475e-06, + "loss": 0.19703936576843262, + "step": 1400 + }, + { + "epoch": 0.4512, + "grad_norm": 0.48450994567172556, + "learning_rate": 3.3486315293890693e-06, + "loss": 0.20506525039672852, + "step": 1410 + }, + { + "epoch": 0.4544, + "grad_norm": 0.48940983460177095, + "learning_rate": 3.3223072908666053e-06, + "loss": 0.19508613348007203, + "step": 1420 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5510507698314822, + "learning_rate": 3.295880416576153e-06, + "loss": 0.20555310249328612, + "step": 1430 + }, + { + "epoch": 0.4608, + "grad_norm": 0.45473195837081576, + "learning_rate": 3.269354204971427e-06, + "loss": 0.19813575744628906, + "step": 1440 + }, + { + "epoch": 0.464, + "grad_norm": 0.4854091562037593, + "learning_rate": 3.242731966904865e-06, + "loss": 0.19694712162017822, + "step": 1450 + }, + { + "epoch": 0.4672, + "grad_norm": 0.4637441174996577, + "learning_rate": 3.2160170252143913e-06, + "loss": 0.1959088087081909, + "step": 1460 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4460606032902631, + "learning_rate": 3.1892127143086716e-06, + "loss": 0.20340628623962403, + "step": 1470 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4768689558424143, + "learning_rate": 3.1623223797509347e-06, + "loss": 0.19146734476089478, + "step": 1480 + }, + { + "epoch": 0.4768, + "grad_norm": 0.46631038217283505, + "learning_rate": 3.135349377841396e-06, + "loss": 0.19588179588317872, + "step": 1490 + }, + { + "epoch": 0.48, + "grad_norm": 0.48197350793708515, + "learning_rate": 3.1082970751983497e-06, + "loss": 0.20245718955993652, + "step": 1500 + }, + { + "epoch": 0.4832, + "grad_norm": 0.44408940491911375, + "learning_rate": 3.0811688483379546e-06, + "loss": 0.19959219694137573, + "step": 1510 + }, + { + "epoch": 0.4864, + "grad_norm": 0.47255519902507054, + "learning_rate": 3.0539680832528074e-06, + "loss": 0.1994904398918152, + "step": 1520 + }, + { + "epoch": 0.4896, + "grad_norm": 0.48800627171777977, + "learning_rate": 3.026698174989316e-06, + "loss": 0.19807126522064208, + "step": 1530 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4748737132528679, + "learning_rate": 2.999362527223952e-06, + "loss": 0.19806113243103027, + "step": 1540 + }, + { + "epoch": 0.496, + "grad_norm": 0.47637730688550123, + "learning_rate": 2.9719645518384194e-06, + "loss": 0.19955278635025026, + "step": 1550 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5411554495039922, + "learning_rate": 2.944507668493807e-06, + "loss": 0.202299165725708, + "step": 1560 + }, + { + "epoch": 0.5024, + "grad_norm": 0.48642193804707995, + "learning_rate": 2.9169953042037623e-06, + "loss": 0.19863581657409668, + "step": 1570 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5363553346933208, + "learning_rate": 2.889430892906754e-06, + "loss": 0.19409118890762328, + "step": 1580 + }, + { + "epoch": 0.5088, + "grad_norm": 0.47187050499878397, + "learning_rate": 2.861817875037462e-06, + "loss": 0.1912764310836792, + "step": 1590 + }, + { + "epoch": 0.512, + "grad_norm": 0.5163595948637988, + "learning_rate": 2.8341596970973683e-06, + "loss": 0.20115599632263184, + "step": 1600 + }, + { + "epoch": 0.5152, + "grad_norm": 0.5033907485073755, + "learning_rate": 2.80645981122458e-06, + "loss": 0.19687057733535768, + "step": 1610 + }, + { + "epoch": 0.5184, + "grad_norm": 0.4753722793172304, + "learning_rate": 2.7787216747629508e-06, + "loss": 0.20292258262634277, + "step": 1620 + }, + { + "epoch": 0.5216, + "grad_norm": 0.46781165760957, + "learning_rate": 2.7509487498305615e-06, + "loss": 0.18959319591522217, + "step": 1630 + }, + { + "epoch": 0.5248, + "grad_norm": 0.4803554793777817, + "learning_rate": 2.7231445028875924e-06, + "loss": 0.19619333744049072, + "step": 1640 + }, + { + "epoch": 0.528, + "grad_norm": 0.43719126287209875, + "learning_rate": 2.6953124043036604e-06, + "loss": 0.19511375427246094, + "step": 1650 + }, + { + "epoch": 0.5312, + "grad_norm": 0.4689037514921924, + "learning_rate": 2.667455927924667e-06, + "loss": 0.19399585723876953, + "step": 1660 + }, + { + "epoch": 0.5344, + "grad_norm": 0.48479905355532704, + "learning_rate": 2.6395785506392164e-06, + "loss": 0.1896076202392578, + "step": 1670 + }, + { + "epoch": 0.5376, + "grad_norm": 0.516453973005613, + "learning_rate": 2.6116837519446407e-06, + "loss": 0.1939442992210388, + "step": 1680 + }, + { + "epoch": 0.5408, + "grad_norm": 0.47710575683228795, + "learning_rate": 2.5837750135127192e-06, + "loss": 0.19078316688537597, + "step": 1690 + }, + { + "epoch": 0.544, + "grad_norm": 0.47654319681013313, + "learning_rate": 2.555855818755108e-06, + "loss": 0.19690483808517456, + "step": 1700 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5030326386548561, + "learning_rate": 2.5279296523885636e-06, + "loss": 0.19325432777404786, + "step": 1710 + }, + { + "epoch": 0.5504, + "grad_norm": 0.49452423153374125, + "learning_rate": 2.5e-06, + "loss": 0.19436432123184205, + "step": 1720 + }, + { + "epoch": 0.5536, + "grad_norm": 0.5135088244704792, + "learning_rate": 2.472070347611437e-06, + "loss": 0.1878933072090149, + "step": 1730 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5160118206798595, + "learning_rate": 2.444144181244893e-06, + "loss": 0.19355961084365844, + "step": 1740 + }, + { + "epoch": 0.56, + "grad_norm": 0.5069308846787346, + "learning_rate": 2.416224986487282e-06, + "loss": 0.19122695922851562, + "step": 1750 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5385800538703149, + "learning_rate": 2.3883162480553605e-06, + "loss": 0.18820159435272216, + "step": 1760 + }, + { + "epoch": 0.5664, + "grad_norm": 0.49129457413116234, + "learning_rate": 2.3604214493607844e-06, + "loss": 0.19197521209716797, + "step": 1770 + }, + { + "epoch": 0.5696, + "grad_norm": 0.4908165776123557, + "learning_rate": 2.332544072075333e-06, + "loss": 0.19534649848937988, + "step": 1780 + }, + { + "epoch": 0.5728, + "grad_norm": 0.49497656453552125, + "learning_rate": 2.30468759569634e-06, + "loss": 0.19484236240386962, + "step": 1790 + }, + { + "epoch": 0.576, + "grad_norm": 0.466973816624908, + "learning_rate": 2.276855497112408e-06, + "loss": 0.191474986076355, + "step": 1800 + }, + { + "epoch": 0.5792, + "grad_norm": 0.498294237386886, + "learning_rate": 2.2490512501694394e-06, + "loss": 0.18636202812194824, + "step": 1810 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5110432771457695, + "learning_rate": 2.2212783252370496e-06, + "loss": 0.19112749099731446, + "step": 1820 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4923044532988948, + "learning_rate": 2.1935401887754213e-06, + "loss": 0.19590845108032226, + "step": 1830 + }, + { + "epoch": 0.5888, + "grad_norm": 0.49881036242858373, + "learning_rate": 2.165840302902632e-06, + "loss": 0.18917866945266723, + "step": 1840 + }, + { + "epoch": 0.592, + "grad_norm": 0.5070848566140863, + "learning_rate": 2.1381821249625383e-06, + "loss": 0.1955878973007202, + "step": 1850 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5245919327161893, + "learning_rate": 2.1105691070932465e-06, + "loss": 0.18681724071502687, + "step": 1860 + }, + { + "epoch": 0.5984, + "grad_norm": 0.5043139368489675, + "learning_rate": 2.083004695796238e-06, + "loss": 0.185194993019104, + "step": 1870 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5180452275250914, + "learning_rate": 2.055492331506194e-06, + "loss": 0.1928567886352539, + "step": 1880 + }, + { + "epoch": 0.6048, + "grad_norm": 0.5320215436686966, + "learning_rate": 2.0280354481615814e-06, + "loss": 0.19074957370758056, + "step": 1890 + }, + { + "epoch": 0.608, + "grad_norm": 0.4725862343819939, + "learning_rate": 2.000637472776049e-06, + "loss": 0.19257795810699463, + "step": 1900 + }, + { + "epoch": 0.6112, + "grad_norm": 0.46908638481055026, + "learning_rate": 1.973301825010685e-06, + "loss": 0.18594731092453004, + "step": 1910 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5595713557618127, + "learning_rate": 1.9460319167471934e-06, + "loss": 0.19121139049530028, + "step": 1920 + }, + { + "epoch": 0.6176, + "grad_norm": 0.507704360185881, + "learning_rate": 1.9188311516620466e-06, + "loss": 0.18624544143676758, + "step": 1930 + }, + { + "epoch": 0.6208, + "grad_norm": 0.4860192603301521, + "learning_rate": 1.891702924801651e-06, + "loss": 0.19231630563735963, + "step": 1940 + }, + { + "epoch": 0.624, + "grad_norm": 0.5275367662218493, + "learning_rate": 1.864650622158604e-06, + "loss": 0.19608126878738402, + "step": 1950 + }, + { + "epoch": 0.6272, + "grad_norm": 0.49282562967431837, + "learning_rate": 1.8376776202490666e-06, + "loss": 0.19235665798187257, + "step": 1960 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5182260002744055, + "learning_rate": 1.8107872856913293e-06, + "loss": 0.18613014221191407, + "step": 1970 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5103313601861706, + "learning_rate": 1.7839829747856096e-06, + "loss": 0.1881113052368164, + "step": 1980 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5451499180289584, + "learning_rate": 1.7572680330951359e-06, + "loss": 0.18735458850860595, + "step": 1990 + }, + { + "epoch": 0.64, + "grad_norm": 0.5090636315844644, + "learning_rate": 1.7306457950285747e-06, + "loss": 0.1885282278060913, + "step": 2000 + }, + { + "epoch": 0.6432, + "grad_norm": 0.4758742975901025, + "learning_rate": 1.704119583423848e-06, + "loss": 0.18241598606109619, + "step": 2010 + }, + { + "epoch": 0.6464, + "grad_norm": 0.49602490022248863, + "learning_rate": 1.677692709133396e-06, + "loss": 0.19074147939682007, + "step": 2020 + }, + { + "epoch": 0.6496, + "grad_norm": 0.520455285125112, + "learning_rate": 1.6513684706109311e-06, + "loss": 0.19024887084960937, + "step": 2030 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5234524283247538, + "learning_rate": 1.6251501534997529e-06, + "loss": 0.18900917768478392, + "step": 2040 + }, + { + "epoch": 0.656, + "grad_norm": 0.4762667999370438, + "learning_rate": 1.5990410302226405e-06, + "loss": 0.18147594928741456, + "step": 2050 + }, + { + "epoch": 0.6592, + "grad_norm": 0.4931916769975977, + "learning_rate": 1.5730443595734162e-06, + "loss": 0.18815698623657226, + "step": 2060 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5595459804684163, + "learning_rate": 1.5471633863101982e-06, + "loss": 0.18958520889282227, + "step": 2070 + }, + { + "epoch": 0.6656, + "grad_norm": 0.551381176131532, + "learning_rate": 1.521401340750407e-06, + "loss": 0.1908926248550415, + "step": 2080 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5155022860725758, + "learning_rate": 1.495761438367577e-06, + "loss": 0.18872777223587037, + "step": 2090 + }, + { + "epoch": 0.672, + "grad_norm": 0.6037433446756716, + "learning_rate": 1.4702468793900187e-06, + "loss": 0.18800405263900757, + "step": 2100 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5613773833705744, + "learning_rate": 1.444860848401384e-06, + "loss": 0.18743778467178346, + "step": 2110 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5277286435676816, + "learning_rate": 1.4196065139431866e-06, + "loss": 0.18769149780273436, + "step": 2120 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5487755330646784, + "learning_rate": 1.3944870281193178e-06, + "loss": 0.1866753101348877, + "step": 2130 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5319334450957595, + "learning_rate": 1.3695055262026208e-06, + "loss": 0.19193503856658936, + "step": 2140 + }, + { + "epoch": 0.688, + "grad_norm": 0.5061777243502238, + "learning_rate": 1.3446651262435679e-06, + "loss": 0.18499069213867186, + "step": 2150 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5063080834031065, + "learning_rate": 1.3199689286810746e-06, + "loss": 0.18700281381607056, + "step": 2160 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5014045449596041, + "learning_rate": 1.2954200159555294e-06, + "loss": 0.18185386657714844, + "step": 2170 + }, + { + "epoch": 0.6976, + "grad_norm": 0.5417896517828541, + "learning_rate": 1.2710214521240527e-06, + "loss": 0.18632771968841552, + "step": 2180 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5710908799443121, + "learning_rate": 1.246776282478063e-06, + "loss": 0.18732945919036864, + "step": 2190 + }, + { + "epoch": 0.704, + "grad_norm": 0.5180508096448415, + "learning_rate": 1.222687533163181e-06, + "loss": 0.18602204322814941, + "step": 2200 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5480758918229119, + "learning_rate": 1.1987582108015228e-06, + "loss": 0.18710973262786865, + "step": 2210 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5631818126474104, + "learning_rate": 1.1749913021164255e-06, + "loss": 0.18828771114349366, + "step": 2220 + }, + { + "epoch": 0.7136, + "grad_norm": 0.4833634541431531, + "learning_rate": 1.1513897735596702e-06, + "loss": 0.18257718086242675, + "step": 2230 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5051522117897481, + "learning_rate": 1.127956570941218e-06, + "loss": 0.17966469526290893, + "step": 2240 + }, + { + "epoch": 0.72, + "grad_norm": 0.5404271805851407, + "learning_rate": 1.104694619061533e-06, + "loss": 0.18814800977706908, + "step": 2250 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5147342090287059, + "learning_rate": 1.0816068213465295e-06, + "loss": 0.1908186197280884, + "step": 2260 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5558495401174878, + "learning_rate": 1.0586960594851762e-06, + "loss": 0.1859324097633362, + "step": 2270 + }, + { + "epoch": 0.7296, + "grad_norm": 0.6185737554957568, + "learning_rate": 1.0359651930698217e-06, + "loss": 0.18477405309677125, + "step": 2280 + }, + { + "epoch": 0.7328, + "grad_norm": 0.5398647348951853, + "learning_rate": 1.0134170592392837e-06, + "loss": 0.1857767939567566, + "step": 2290 + }, + { + "epoch": 0.736, + "grad_norm": 0.5450678028060058, + "learning_rate": 9.910544723247204e-07, + "loss": 0.184822678565979, + "step": 2300 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5999082382312588, + "learning_rate": 9.688802234983706e-07, + "loss": 0.18381783962249756, + "step": 2310 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5175099712487172, + "learning_rate": 9.468970804251742e-07, + "loss": 0.18641353845596315, + "step": 2320 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5367638040398911, + "learning_rate": 9.251077869173244e-07, + "loss": 0.18090612888336183, + "step": 2330 + }, + { + "epoch": 0.7488, + "grad_norm": 0.563594153188617, + "learning_rate": 9.035150625918054e-07, + "loss": 0.18149322271347046, + "step": 2340 + }, + { + "epoch": 0.752, + "grad_norm": 0.5304713442318342, + "learning_rate": 8.821216025309395e-07, + "loss": 0.18464915752410888, + "step": 2350 + }, + { + "epoch": 0.7552, + "grad_norm": 0.535119183480021, + "learning_rate": 8.609300769460055e-07, + "loss": 0.1792607307434082, + "step": 2360 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5724539486438234, + "learning_rate": 8.399431308439592e-07, + "loss": 0.183684778213501, + "step": 2370 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5589161632397335, + "learning_rate": 8.191633836972962e-07, + "loss": 0.18650429248809813, + "step": 2380 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5386156132762686, + "learning_rate": 7.985934291171024e-07, + "loss": 0.1821720838546753, + "step": 2390 + }, + { + "epoch": 0.768, + "grad_norm": 0.5321288466713382, + "learning_rate": 7.7823583452934e-07, + "loss": 0.18489625453948974, + "step": 2400 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5670301824645666, + "learning_rate": 7.58093140854389e-07, + "loss": 0.18495336771011353, + "step": 2410 + }, + { + "epoch": 0.7744, + "grad_norm": 0.6058756306995335, + "learning_rate": 7.381678621899077e-07, + "loss": 0.1848145008087158, + "step": 2420 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5477002870283818, + "learning_rate": 7.184624854970379e-07, + "loss": 0.1817490816116333, + "step": 2430 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5458027173632266, + "learning_rate": 6.989794702899932e-07, + "loss": 0.18078404664993286, + "step": 2440 + }, + { + "epoch": 0.784, + "grad_norm": 0.5772130708628379, + "learning_rate": 6.797212483290777e-07, + "loss": 0.18299766778945922, + "step": 2450 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5674146932938366, + "learning_rate": 6.60690223317171e-07, + "loss": 0.1799448013305664, + "step": 2460 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5238538237059384, + "learning_rate": 6.418887705997046e-07, + "loss": 0.1826066255569458, + "step": 2470 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5857270779434125, + "learning_rate": 6.23319236868189e-07, + "loss": 0.18549437522888185, + "step": 2480 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5274424793724192, + "learning_rate": 6.049839398673141e-07, + "loss": 0.1865037798881531, + "step": 2490 + }, + { + "epoch": 0.8, + "grad_norm": 0.5820741885019232, + "learning_rate": 5.868851681056567e-07, + "loss": 0.18739759922027588, + "step": 2500 + }, + { + "epoch": 0.8032, + "grad_norm": 0.559971376703767, + "learning_rate": 5.690251805700467e-07, + "loss": 0.1853170394897461, + "step": 2510 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5456407872897143, + "learning_rate": 5.514062064436096e-07, + "loss": 0.18589026927948, + "step": 2520 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5866178273652722, + "learning_rate": 5.34030444827533e-07, + "loss": 0.1827709197998047, + "step": 2530 + }, + { + "epoch": 0.8128, + "grad_norm": 0.588749656654477, + "learning_rate": 5.169000644665895e-07, + "loss": 0.1794450044631958, + "step": 2540 + }, + { + "epoch": 0.816, + "grad_norm": 0.5778176841150756, + "learning_rate": 5.000172034784442e-07, + "loss": 0.18060548305511476, + "step": 2550 + }, + { + "epoch": 0.8192, + "grad_norm": 0.566426267196354, + "learning_rate": 4.833839690867853e-07, + "loss": 0.18326361179351808, + "step": 2560 + }, + { + "epoch": 0.8224, + "grad_norm": 0.5763812670051818, + "learning_rate": 4.6700243735831705e-07, + "loss": 0.17798151969909667, + "step": 2570 + }, + { + "epoch": 0.8256, + "grad_norm": 0.5465254160649792, + "learning_rate": 4.508746529436311e-07, + "loss": 0.1761394739151001, + "step": 2580 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5717164779412172, + "learning_rate": 4.350026288220083e-07, + "loss": 0.18241602182388306, + "step": 2590 + }, + { + "epoch": 0.832, + "grad_norm": 0.5532919690194787, + "learning_rate": 4.1938834605017133e-07, + "loss": 0.1799800157546997, + "step": 2600 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5485503614596886, + "learning_rate": 4.0403375351501515e-07, + "loss": 0.18037915229797363, + "step": 2610 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5921392059955939, + "learning_rate": 3.88940767690362e-07, + "loss": 0.17850807905197144, + "step": 2620 + }, + { + "epoch": 0.8416, + "grad_norm": 0.6173777417506611, + "learning_rate": 3.7411127239775774e-07, + "loss": 0.17773046493530273, + "step": 2630 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5704461135916385, + "learning_rate": 3.595471185713431e-07, + "loss": 0.17534157037734985, + "step": 2640 + }, + { + "epoch": 0.848, + "grad_norm": 0.6016600022490033, + "learning_rate": 3.4525012402682826e-07, + "loss": 0.17784465551376344, + "step": 2650 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5793357844007763, + "learning_rate": 3.3122207323460804e-07, + "loss": 0.17941689491271973, + "step": 2660 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5402101980665998, + "learning_rate": 3.1746471709702963e-07, + "loss": 0.17694177627563476, + "step": 2670 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5764717205309013, + "learning_rate": 3.039797727298585e-07, + "loss": 0.18307201862335204, + "step": 2680 + }, + { + "epoch": 0.8608, + "grad_norm": 0.6021889152147203, + "learning_rate": 2.9076892324795546e-07, + "loss": 0.18175405263900757, + "step": 2690 + }, + { + "epoch": 0.864, + "grad_norm": 0.5783244972157141, + "learning_rate": 2.778338175551995e-07, + "loss": 0.17646790742874147, + "step": 2700 + }, + { + "epoch": 0.8672, + "grad_norm": 0.573282650162234, + "learning_rate": 2.6517607013868326e-07, + "loss": 0.18459818363189698, + "step": 2710 + }, + { + "epoch": 0.8704, + "grad_norm": 0.6039696058732922, + "learning_rate": 2.527972608672002e-07, + "loss": 0.18084490299224854, + "step": 2720 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5916439702722857, + "learning_rate": 2.40698934794053e-07, + "loss": 0.18053301572799682, + "step": 2730 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5703451942226244, + "learning_rate": 2.2888260196421237e-07, + "loss": 0.1792958378791809, + "step": 2740 + }, + { + "epoch": 0.88, + "grad_norm": 0.5672304805383847, + "learning_rate": 2.1734973722583735e-07, + "loss": 0.1819172501564026, + "step": 2750 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5784570642525821, + "learning_rate": 2.0610178004619564e-07, + "loss": 0.17332799434661866, + "step": 2760 + }, + { + "epoch": 0.8864, + "grad_norm": 0.575451427907292, + "learning_rate": 1.9514013433199834e-07, + "loss": 0.18558990955352783, + "step": 2770 + }, + { + "epoch": 0.8896, + "grad_norm": 0.5133461724908028, + "learning_rate": 1.8446616825416958e-07, + "loss": 0.18399085998535156, + "step": 2780 + }, + { + "epoch": 0.8928, + "grad_norm": 0.6123280023323261, + "learning_rate": 1.7408121407708007e-07, + "loss": 0.1844745397567749, + "step": 2790 + }, + { + "epoch": 0.896, + "grad_norm": 0.5761361465385083, + "learning_rate": 1.6398656799226253e-07, + "loss": 0.17304511070251466, + "step": 2800 + }, + { + "epoch": 0.8992, + "grad_norm": 0.6034414454227958, + "learning_rate": 1.5418348995662773e-07, + "loss": 0.17871806621551514, + "step": 2810 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5923974971972374, + "learning_rate": 1.4467320353520275e-07, + "loss": 0.17667040824890137, + "step": 2820 + }, + { + "epoch": 0.9056, + "grad_norm": 0.603734748014922, + "learning_rate": 1.3545689574841341e-07, + "loss": 0.1787508487701416, + "step": 2830 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5750783540393263, + "learning_rate": 1.26535716923927e-07, + "loss": 0.18438329696655273, + "step": 2840 + }, + { + "epoch": 0.912, + "grad_norm": 0.5716942434142535, + "learning_rate": 1.1791078055307493e-07, + "loss": 0.1802410364151001, + "step": 2850 + }, + { + "epoch": 0.9152, + "grad_norm": 0.6031535401501658, + "learning_rate": 1.0958316315187289e-07, + "loss": 0.17950894832611083, + "step": 2860 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5724651470732645, + "learning_rate": 1.0155390412665528e-07, + "loss": 0.17800890207290648, + "step": 2870 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5920847136083833, + "learning_rate": 9.38240056443443e-08, + "loss": 0.17559461593627929, + "step": 2880 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5600845233888927, + "learning_rate": 8.639443250736402e-08, + "loss": 0.17780338525772094, + "step": 2890 + }, + { + "epoch": 0.928, + "grad_norm": 0.5760602589693042, + "learning_rate": 7.926611203321777e-08, + "loss": 0.1794909954071045, + "step": 2900 + }, + { + "epoch": 0.9312, + "grad_norm": 0.59057677772977, + "learning_rate": 7.243993393874882e-08, + "loss": 0.1795297384262085, + "step": 2910 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5693422129621047, + "learning_rate": 6.591675022908805e-08, + "loss": 0.17676992416381837, + "step": 2920 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5656532345210596, + "learning_rate": 5.969737509131241e-08, + "loss": 0.17433459758758546, + "step": 2930 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5865348817236666, + "learning_rate": 5.3782584792823334e-08, + "loss": 0.1795581579208374, + "step": 2940 + }, + { + "epoch": 0.944, + "grad_norm": 0.6034375830769324, + "learning_rate": 4.817311758445686e-08, + "loss": 0.18066773414611817, + "step": 2950 + }, + { + "epoch": 0.9472, + "grad_norm": 0.598761782830776, + "learning_rate": 4.286967360833866e-08, + "loss": 0.1803189516067505, + "step": 2960 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5410244646488507, + "learning_rate": 3.787291481049754e-08, + "loss": 0.18075671195983886, + "step": 2970 + }, + { + "epoch": 0.9536, + "grad_norm": 0.6102805369465131, + "learning_rate": 3.3183464858244364e-08, + "loss": 0.18705531358718872, + "step": 2980 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5798299084498433, + "learning_rate": 2.8801909062328992e-08, + "loss": 0.17331962585449218, + "step": 2990 + }, + { + "epoch": 0.96, + "grad_norm": 0.5999449762716584, + "learning_rate": 2.4728794303886248e-08, + "loss": 0.17158935070037842, + "step": 3000 + }, + { + "epoch": 0.9632, + "grad_norm": 0.6212882795186798, + "learning_rate": 2.0964628966175794e-08, + "loss": 0.17738908529281616, + "step": 3010 + }, + { + "epoch": 0.9664, + "grad_norm": 0.564746561855876, + "learning_rate": 1.750988287113009e-08, + "loss": 0.17667733430862426, + "step": 3020 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5852806549215316, + "learning_rate": 1.4364987220713278e-08, + "loss": 0.18457986116409303, + "step": 3030 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5991233203919278, + "learning_rate": 1.1530334543099763e-08, + "loss": 0.18215363025665282, + "step": 3040 + }, + { + "epoch": 0.976, + "grad_norm": 0.6041102228390866, + "learning_rate": 9.006278643683697e-09, + "loss": 0.18243587017059326, + "step": 3050 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5869697890802611, + "learning_rate": 6.793134560916514e-09, + "loss": 0.18486570119857787, + "step": 3060 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5595978682216465, + "learning_rate": 4.891178526986451e-09, + "loss": 0.18047856092453002, + "step": 3070 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5638404572903396, + "learning_rate": 3.3006479333413943e-09, + "loss": 0.18349089622497558, + "step": 3080 + }, + { + "epoch": 0.9888, + "grad_norm": 0.5582534730189623, + "learning_rate": 2.021741301058422e-09, + "loss": 0.18032891750335694, + "step": 3090 + }, + { + "epoch": 0.992, + "grad_norm": 0.5757824692806152, + "learning_rate": 1.0546182560652872e-09, + "loss": 0.1812995433807373, + "step": 3100 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5718406851297113, + "learning_rate": 3.9939950921774607e-10, + "loss": 0.17747504711151124, + "step": 3110 + }, + { + "epoch": 0.9984, + "grad_norm": 0.549457935685087, + "learning_rate": 5.616684123160854e-11, + "loss": 0.17633507251739503, + "step": 3120 + }, + { + "epoch": 1.0, + "step": 3125, + "total_flos": 2477163648385024.0, + "train_loss": 0.20598802658081056, + "train_runtime": 35266.4791, + "train_samples_per_second": 5.671, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 10, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2477163648385024.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png b/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c6f8a107f88ce6fd0298ddbb917d6ebee5122146 Binary files /dev/null and b/checkpoints/GLM-4.6V-Flash-SFT/training_loss.png differ diff --git a/checkpoints/Gemma-4-E4B-it-SFT/all_results.json b/checkpoints/Gemma-4-E4B-it-SFT/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..95fe79d7fbf9dc20758150408b914257954b6c4a --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 1.0913057758773248e+16, + "train_loss": 0.7292402684783935, + "train_runtime": 30167.0559, + "train_samples_per_second": 6.63, + "train_steps_per_second": 0.104 +} \ No newline at end of file diff --git a/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja b/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..afb1d517bedb410d5bb32df4300d17f6e5888e2a --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja @@ -0,0 +1,263 @@ +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{ bos_token }} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {{- messages[0]['content'] | trim -}} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + + {{- '\n' -}} +{%- endif %} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {{- '<|turn>' + role + '\n' }} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- if message['tool_responses'] -%} + {#- Tool Response handling -#} + {%- for tool_response in message['tool_responses'] -%} + {{- '<|tool_response>' -}} + {%- if tool_response['response'] is mapping -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}} + {%- for key, value in tool_response['response'] | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '\n\n<|image|>\n\n' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '\n\n<|video|>\n\n' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- if not (message['tool_responses'] and not message['content']) -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoints/Gemma-4-E4B-it-SFT/config.json b/checkpoints/Gemma-4-E4B-it-SFT/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f697944ac8a7124c0cdbc70da313658644d7a22b --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/config.json @@ -0,0 +1,199 @@ +{ + "architectures": [ + "Gemma4ForConditionalGeneration" + ], + "audio_config": { + "_name_or_path": "", + "architectures": null, + "attention_chunk_size": 12, + "attention_context_left": 13, + "attention_context_right": 0, + "attention_invalid_logits_value": -1000000000.0, + "attention_logit_cap": 50.0, + "chunk_size_feed_forward": 0, + "conv_kernel_size": 5, + "dtype": "bfloat16", + "gradient_clipping": 10000000000.0, + "hidden_act": "silu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "model_type": "gemma4_audio", + "num_attention_heads": 8, + "num_hidden_layers": 12, + "output_attentions": false, + "output_hidden_states": false, + "output_proj_dims": 1536, + "problem_type": null, + "residual_weight": 0.5, + "return_dict": true, + "rms_norm_eps": 1e-06, + "subsampling_conv_channels": [ + 128, + 32 + ], + "use_clipped_linears": true + }, + "audio_token_id": 258881, + "boa_token_id": 256000, + "boi_token_id": 255999, + "bos_token_id": 2, + "dtype": "bfloat16", + "eoa_token_id": 258883, + "eoa_token_index": 258883, + "eoi_token_id": 258882, + "eos_token_id": 106, + "hidden_size": 2560, + "image_token_id": 258880, + "initializer_range": 0.02, + "model_type": "gemma4", + "pad_token_id": 0, + "text_config": { + "attention_bias": false, + "attention_dropout": 0.0, + "attention_k_eq_v": false, + "bos_token_id": 2, + "dtype": "bfloat16", + "enable_moe_block": false, + "eos_token_id": 1, + "expert_intermediate_size": null, + "final_logit_softcapping": 30.0, + "global_head_dim": 512, + "head_dim": 256, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 2560, + "hidden_size_per_layer_input": 256, + "initializer_range": 0.02, + "intermediate_size": 10240, + "layer_types": [ + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "model_type": "gemma4_text", + "moe_intermediate_size": null, + "num_attention_heads": 8, + "num_experts": null, + "num_global_key_value_heads": null, + "num_hidden_layers": 42, + "num_key_value_heads": 2, + "num_kv_shared_layers": 18, + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "full_attention": { + "partial_rotary_factor": 0.25, + "rope_theta": 1000000.0, + "rope_type": "proportional" + }, + "sliding_attention": { + "rope_theta": 10000.0, + "rope_type": "default" + } + }, + "sliding_window": 512, + "tie_word_embeddings": true, + "top_k_experts": null, + "use_bidirectional_attention": null, + "use_cache": false, + "use_double_wide_mlp": false, + "vocab_size": 262144, + "vocab_size_per_layer_input": 262144 + }, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "video_token_id": 258884, + "vision_config": { + "_name_or_path": "", + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size_feed_forward": 0, + "default_output_length": 280, + "dtype": "bfloat16", + "global_head_dim": 64, + "head_dim": 64, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 768, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "max_position_embeddings": 131072, + "model_type": "gemma4_vision", + "num_attention_heads": 12, + "num_hidden_layers": 16, + "num_key_value_heads": 12, + "output_attentions": false, + "output_hidden_states": false, + "patch_size": 16, + "pooling_kernel_size": 3, + "position_embedding_size": 10240, + "problem_type": null, + "return_dict": true, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 100.0, + "rope_type": "default" + }, + "standardize": false, + "use_clipped_linears": true + }, + "vision_soft_tokens_per_image": 280 +} diff --git a/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json b/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json new file mode 100644 index 0000000000000000000000000000000000000000..8a51cfe202de611bfc2d01db96808e984bc26986 --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json @@ -0,0 +1,56 @@ +{ + "mae_dx": 0.48666724137931033, + "rmse_dx": 1.0707492462177417, + "mae_dy": 0.3855034482758621, + "rmse_dy": 0.7843001492655289, + "mae_dz": 0.04997413793103449, + "rmse_dz": 0.156602120477122, + "mae_dpitch": 0.9934068965517242, + "rmse_dpitch": 1.7330746049166195, + "mae_dyaw": 2.2219862068965517, + "rmse_dyaw": 3.906024586323736, + "mae_droll": 0.0, + "rmse_droll": 0.0, + "mae_overall": 0.6895896551724138, + "mae_position": 0.30738160919540225, + "mae_rotation": 1.0717977011494253, + "rmse_overall": 1.8278735619896387, + "wp1_euc_mae": 0.2665493636964831, + "wp1_euc_median": 0.18, + "wp2_euc_mae": 0.5012943438070621, + "wp2_euc_median": 0.31144823004794875, + "wp3_euc_mae": 0.7271333853911885, + "wp3_euc_median": 0.48, + "wp4_euc_mae": 0.958032444080531, + "wp4_euc_median": 0.6351377754492935, + "wp5_euc_mae": 1.1876023185914943, + "wp5_euc_median": 0.7778817364281356, + "euclidean_mae": 0.7281223711133517, + "ADE": 0.7281223711133519, + "FDE": 1.1876023185914943, + "ADE_median": 0.49122803576716423, + "FDE_median": 0.7778817364281356, + "SR@0.5m": 0.5736206896551724, + "SR@1.0m": 0.783448275862069, + "SR@2.0m": 0.9222413793103448, + "SR@5.0m": 0.9898275862068966, + "TrajSR@1.0m": 0.5887931034482758, + "TrajSR@2.0m": 0.8353448275862069, + "TrajSR@5.0m": 0.9724137931034482, + "RotAcc@1.0deg": 0.39948275862068966, + "RotAcc@5.0deg": 0.83, + "RotAcc@10.0deg": 0.9762068965517241, + "wp1_rot_mae": 1.8561397413473146, + "wp2_rot_mae": 2.249132034716281, + "wp3_rot_mae": 2.6355453352548355, + "wp4_rot_mae": 3.048629056478642, + "wp5_rot_mae": 3.45811827126434, + "rotation_euc_mae": 2.6495128878122824, + "parse_failure_rate": 0.0, + "parse_success_rate": 1.0, + "valid_samples": 1160, + "total_samples": 1160, + "parse_failures": 0, + "inference_engine": "vllm", + "vllm_version": "0.19.0" +} \ No newline at end of file diff --git a/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json b/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6633d85c38512e1932d115a29dca1605862e16e2 --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/generation_config.json @@ -0,0 +1,15 @@ +{ + "bos_token_id": 2, + "do_sample": true, + "eos_token_id": [ + 106, + 1, + 106, + 50 + ], + "pad_token_id": 0, + "temperature": 1.0, + "top_k": 64, + "top_p": 0.95, + "transformers_version": "5.5.3" +} diff --git a/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors b/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c78bfd5e93a040783e456059d34ae42d8c59ddaf --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa115532595f57272ed0b16337a23de6762ffa60ab858147f5f51f1cff34105b +size 15992595884 diff --git a/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json b/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5465974d23e1eca2c46c2809b26c997946ce0d90 --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "right", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1c3eff3ad5a4d3913ba8e5f36a14ed1c7e51d6 --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json @@ -0,0 +1,96 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": true, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P(?:(?!\\<\\|tool_call\\>)(?!\\).)+)?(?P\\<\\|tool_call\\>.*\\)?(?:\\)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "split_special_tokens": false, + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/checkpoints/Gemma-4-E4B-it-SFT/train_results.json b/checkpoints/Gemma-4-E4B-it-SFT/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..95fe79d7fbf9dc20758150408b914257954b6c4a --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 1.0913057758773248e+16, + "train_loss": 0.7292402684783935, + "train_runtime": 30167.0559, + "train_samples_per_second": 6.63, + "train_steps_per_second": 0.104 +} \ No newline at end of file diff --git a/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json b/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f369a1edf7f1302c8239943827d3b72e96e101f --- /dev/null +++ b/checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json @@ -0,0 +1,2227 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 366.0841096744857, + "learning_rate": 1.437699680511182e-07, + "loss": 23.85431823730469, + "step": 10 + }, + { + "epoch": 0.0064, + "grad_norm": 367.47333882445946, + "learning_rate": 3.0351437699680514e-07, + "loss": 23.65589599609375, + "step": 20 + }, + { + "epoch": 0.0096, + "grad_norm": 367.96579270464326, + "learning_rate": 4.6325878594249205e-07, + "loss": 22.780029296875, + "step": 30 + }, + { + "epoch": 0.0128, + "grad_norm": 332.5732884154056, + "learning_rate": 6.230031948881789e-07, + "loss": 20.279689025878906, + "step": 40 + }, + { + "epoch": 0.016, + "grad_norm": 219.53674756423746, + "learning_rate": 7.82747603833866e-07, + "loss": 15.498806762695313, + "step": 50 + }, + { + "epoch": 0.0192, + "grad_norm": 156.1487544830451, + "learning_rate": 9.424920127795528e-07, + "loss": 10.388201904296874, + "step": 60 + }, + { + "epoch": 0.0224, + "grad_norm": 37.96869040917498, + "learning_rate": 1.1022364217252397e-06, + "loss": 3.7560958862304688, + "step": 70 + }, + { + "epoch": 0.0256, + "grad_norm": 16.783464772614202, + "learning_rate": 1.2619808306709266e-06, + "loss": 2.033830261230469, + "step": 80 + }, + { + "epoch": 0.0288, + "grad_norm": 5.438256169634593, + "learning_rate": 1.4217252396166134e-06, + "loss": 1.0431390762329102, + "step": 90 + }, + { + "epoch": 0.032, + "grad_norm": 3.6935246150045775, + "learning_rate": 1.5814696485623005e-06, + "loss": 0.8069572448730469, + "step": 100 + }, + { + "epoch": 0.0352, + "grad_norm": 9.218312544625562, + "learning_rate": 1.7412140575079875e-06, + "loss": 0.7057615280151367, + "step": 110 + }, + { + "epoch": 0.0384, + "grad_norm": 5.394484238866305, + "learning_rate": 1.9009584664536742e-06, + "loss": 0.6301750183105469, + "step": 120 + }, + { + "epoch": 0.0416, + "grad_norm": 6.577481237217732, + "learning_rate": 2.060702875399361e-06, + "loss": 0.5898516654968262, + "step": 130 + }, + { + "epoch": 0.0448, + "grad_norm": 3.4158074483641068, + "learning_rate": 2.220447284345048e-06, + "loss": 0.5524418830871582, + "step": 140 + }, + { + "epoch": 0.048, + "grad_norm": 4.032046521040006, + "learning_rate": 2.380191693290735e-06, + "loss": 0.5317594051361084, + "step": 150 + }, + { + "epoch": 0.0512, + "grad_norm": 5.468634675306576, + "learning_rate": 2.539936102236422e-06, + "loss": 0.5184277534484864, + "step": 160 + }, + { + "epoch": 0.0544, + "grad_norm": 3.4313124951156424, + "learning_rate": 2.699680511182109e-06, + "loss": 0.5204483985900878, + "step": 170 + }, + { + "epoch": 0.0576, + "grad_norm": 5.13400179254009, + "learning_rate": 2.8594249201277955e-06, + "loss": 0.5058025360107422, + "step": 180 + }, + { + "epoch": 0.0608, + "grad_norm": 5.9183424216837786, + "learning_rate": 3.0191693290734825e-06, + "loss": 0.5073411941528321, + "step": 190 + }, + { + "epoch": 0.064, + "grad_norm": 5.625073986187664, + "learning_rate": 3.17891373801917e-06, + "loss": 0.5000103950500489, + "step": 200 + }, + { + "epoch": 0.0672, + "grad_norm": 5.050603467051007, + "learning_rate": 3.3386581469648564e-06, + "loss": 0.488192081451416, + "step": 210 + }, + { + "epoch": 0.0704, + "grad_norm": 11.776866822937645, + "learning_rate": 3.4984025559105434e-06, + "loss": 0.48699202537536623, + "step": 220 + }, + { + "epoch": 0.0736, + "grad_norm": 7.438900018795585, + "learning_rate": 3.6581469648562303e-06, + "loss": 0.4820102691650391, + "step": 230 + }, + { + "epoch": 0.0768, + "grad_norm": 4.3491840646532065, + "learning_rate": 3.817891373801918e-06, + "loss": 0.47640199661254884, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 3.472565426233091, + "learning_rate": 3.977635782747604e-06, + "loss": 0.4729574203491211, + "step": 250 + }, + { + "epoch": 0.0832, + "grad_norm": 3.1912744148161942, + "learning_rate": 4.137380191693291e-06, + "loss": 0.4786433219909668, + "step": 260 + }, + { + "epoch": 0.0864, + "grad_norm": 3.9698013424470777, + "learning_rate": 4.297124600638978e-06, + "loss": 0.4748369216918945, + "step": 270 + }, + { + "epoch": 0.0896, + "grad_norm": 8.11949393489321, + "learning_rate": 4.456869009584665e-06, + "loss": 0.4681865692138672, + "step": 280 + }, + { + "epoch": 0.0928, + "grad_norm": 4.7349566199381234, + "learning_rate": 4.616613418530352e-06, + "loss": 0.46743001937866213, + "step": 290 + }, + { + "epoch": 0.096, + "grad_norm": 4.756427284033883, + "learning_rate": 4.776357827476039e-06, + "loss": 0.46964178085327146, + "step": 300 + }, + { + "epoch": 0.0992, + "grad_norm": 4.86570605379029, + "learning_rate": 4.936102236421725e-06, + "loss": 0.45612516403198244, + "step": 310 + }, + { + "epoch": 0.1024, + "grad_norm": 5.762654788054032, + "learning_rate": 4.999943833158769e-06, + "loss": 0.45009474754333495, + "step": 320 + }, + { + "epoch": 0.1056, + "grad_norm": 3.501477346053355, + "learning_rate": 4.999600600490783e-06, + "loss": 0.4523477554321289, + "step": 330 + }, + { + "epoch": 0.1088, + "grad_norm": 7.957279740713588, + "learning_rate": 4.9989453817439345e-06, + "loss": 0.44190473556518556, + "step": 340 + }, + { + "epoch": 0.112, + "grad_norm": 7.660308885793361, + "learning_rate": 4.997978258698942e-06, + "loss": 0.43758931159973147, + "step": 350 + }, + { + "epoch": 0.1152, + "grad_norm": 5.8839479464224205, + "learning_rate": 4.996699352066659e-06, + "loss": 0.4371060371398926, + "step": 360 + }, + { + "epoch": 0.1184, + "grad_norm": 3.452842882877267, + "learning_rate": 4.995108821473014e-06, + "loss": 0.42999753952026365, + "step": 370 + }, + { + "epoch": 0.1216, + "grad_norm": 4.825810317520427, + "learning_rate": 4.993206865439084e-06, + "loss": 0.4285894393920898, + "step": 380 + }, + { + "epoch": 0.1248, + "grad_norm": 5.379766821254966, + "learning_rate": 4.990993721356317e-06, + "loss": 0.42139811515808107, + "step": 390 + }, + { + "epoch": 0.128, + "grad_norm": 4.854730410799869, + "learning_rate": 4.988469665456901e-06, + "loss": 0.42040281295776366, + "step": 400 + }, + { + "epoch": 0.1312, + "grad_norm": 4.6616615938661745, + "learning_rate": 4.985635012779288e-06, + "loss": 0.4207456588745117, + "step": 410 + }, + { + "epoch": 0.1344, + "grad_norm": 4.5341296475975605, + "learning_rate": 4.98249011712887e-06, + "loss": 0.414472770690918, + "step": 420 + }, + { + "epoch": 0.1376, + "grad_norm": 5.217437981869656, + "learning_rate": 4.979035371033824e-06, + "loss": 0.41441006660461427, + "step": 430 + }, + { + "epoch": 0.1408, + "grad_norm": 3.561516924716779, + "learning_rate": 4.975271205696115e-06, + "loss": 0.40767755508422854, + "step": 440 + }, + { + "epoch": 0.144, + "grad_norm": 3.815692337476438, + "learning_rate": 4.971198090937671e-06, + "loss": 0.3997596263885498, + "step": 450 + }, + { + "epoch": 0.1472, + "grad_norm": 4.559242371997167, + "learning_rate": 4.966816535141756e-06, + "loss": 0.39360842704772947, + "step": 460 + }, + { + "epoch": 0.1504, + "grad_norm": 3.432229350472061, + "learning_rate": 4.9621270851895035e-06, + "loss": 0.40289998054504395, + "step": 470 + }, + { + "epoch": 0.1536, + "grad_norm": 5.375227134041046, + "learning_rate": 4.957130326391662e-06, + "loss": 0.3982266664505005, + "step": 480 + }, + { + "epoch": 0.1568, + "grad_norm": 5.539585521677851, + "learning_rate": 4.951826882415544e-06, + "loss": 0.39270691871643065, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 3.4147092253345743, + "learning_rate": 4.946217415207177e-06, + "loss": 0.3853750705718994, + "step": 500 + }, + { + "epoch": 0.1632, + "grad_norm": 4.444175842440995, + "learning_rate": 4.940302624908689e-06, + "loss": 0.38694162368774415, + "step": 510 + }, + { + "epoch": 0.1664, + "grad_norm": 3.3493207902303475, + "learning_rate": 4.934083249770912e-06, + "loss": 0.3797153949737549, + "step": 520 + }, + { + "epoch": 0.1696, + "grad_norm": 3.0499194254019097, + "learning_rate": 4.927560066061251e-06, + "loss": 0.38063654899597166, + "step": 530 + }, + { + "epoch": 0.1728, + "grad_norm": 3.141871281336489, + "learning_rate": 4.920733887966783e-06, + "loss": 0.39005699157714846, + "step": 540 + }, + { + "epoch": 0.176, + "grad_norm": 3.979297184951908, + "learning_rate": 4.913605567492636e-06, + "loss": 0.38013472557067873, + "step": 550 + }, + { + "epoch": 0.1792, + "grad_norm": 3.7669251986704113, + "learning_rate": 4.906175994355656e-06, + "loss": 0.37832577228546144, + "step": 560 + }, + { + "epoch": 0.1824, + "grad_norm": 2.983798431857085, + "learning_rate": 4.898446095873345e-06, + "loss": 0.38150479793548586, + "step": 570 + }, + { + "epoch": 0.1856, + "grad_norm": 3.657787030439589, + "learning_rate": 4.890416836848128e-06, + "loss": 0.3775670528411865, + "step": 580 + }, + { + "epoch": 0.1888, + "grad_norm": 3.551048022748126, + "learning_rate": 4.882089219446925e-06, + "loss": 0.37199065685272215, + "step": 590 + }, + { + "epoch": 0.192, + "grad_norm": 4.750977601329729, + "learning_rate": 4.873464283076074e-06, + "loss": 0.3790221452713013, + "step": 600 + }, + { + "epoch": 0.1952, + "grad_norm": 7.684545118387627, + "learning_rate": 4.864543104251587e-06, + "loss": 0.37508673667907716, + "step": 610 + }, + { + "epoch": 0.1984, + "grad_norm": 5.872575231845199, + "learning_rate": 4.855326796464798e-06, + "loss": 0.3811868906021118, + "step": 620 + }, + { + "epoch": 0.2016, + "grad_norm": 3.9960144706794316, + "learning_rate": 4.8458165100433725e-06, + "loss": 0.37326750755310056, + "step": 630 + }, + { + "epoch": 0.2048, + "grad_norm": 3.9998452581157657, + "learning_rate": 4.836013432007738e-06, + "loss": 0.3709099769592285, + "step": 640 + }, + { + "epoch": 0.208, + "grad_norm": 2.6973135018594343, + "learning_rate": 4.825918785922921e-06, + "loss": 0.3728507995605469, + "step": 650 + }, + { + "epoch": 0.2112, + "grad_norm": 4.478756132604264, + "learning_rate": 4.8155338317458315e-06, + "loss": 0.36782591342926024, + "step": 660 + }, + { + "epoch": 0.2144, + "grad_norm": 2.5620662799375378, + "learning_rate": 4.804859865668002e-06, + "loss": 0.36416780948638916, + "step": 670 + }, + { + "epoch": 0.2176, + "grad_norm": 2.9398359151969884, + "learning_rate": 4.793898219953804e-06, + "loss": 0.36772732734680175, + "step": 680 + }, + { + "epoch": 0.2208, + "grad_norm": 3.404020172068192, + "learning_rate": 4.782650262774164e-06, + "loss": 0.3651688575744629, + "step": 690 + }, + { + "epoch": 0.224, + "grad_norm": 2.588678061474319, + "learning_rate": 4.7711173980357886e-06, + "loss": 0.3649880409240723, + "step": 700 + }, + { + "epoch": 0.2272, + "grad_norm": 3.5390276900279773, + "learning_rate": 4.759301065205947e-06, + "loss": 0.3612825870513916, + "step": 710 + }, + { + "epoch": 0.2304, + "grad_norm": 3.8670986814196473, + "learning_rate": 4.7472027391328e-06, + "loss": 0.3657612085342407, + "step": 720 + }, + { + "epoch": 0.2336, + "grad_norm": 3.0276354554801217, + "learning_rate": 4.734823929861317e-06, + "loss": 0.36682844161987305, + "step": 730 + }, + { + "epoch": 0.2368, + "grad_norm": 5.205227283770371, + "learning_rate": 4.722166182444801e-06, + "loss": 0.3605961322784424, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 3.1037248816470737, + "learning_rate": 4.709231076752045e-06, + "loss": 0.3625338554382324, + "step": 750 + }, + { + "epoch": 0.2432, + "grad_norm": 3.827009314178272, + "learning_rate": 4.696020227270142e-06, + "loss": 0.36273531913757323, + "step": 760 + }, + { + "epoch": 0.2464, + "grad_norm": 2.553717481812464, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.35740270614624026, + "step": 770 + }, + { + "epoch": 0.2496, + "grad_norm": 2.8273485176739563, + "learning_rate": 4.668777926765392e-06, + "loss": 0.3613132953643799, + "step": 780 + }, + { + "epoch": 0.2528, + "grad_norm": 3.242165291552063, + "learning_rate": 4.6547498759731725e-06, + "loss": 0.3525214672088623, + "step": 790 + }, + { + "epoch": 0.256, + "grad_norm": 2.607635187753211, + "learning_rate": 4.6404528814286575e-06, + "loss": 0.3569283723831177, + "step": 800 + }, + { + "epoch": 0.2592, + "grad_norm": 3.2439792578606204, + "learning_rate": 4.6258887276022425e-06, + "loss": 0.357681941986084, + "step": 810 + }, + { + "epoch": 0.2624, + "grad_norm": 2.9728036180938284, + "learning_rate": 4.611059232309639e-06, + "loss": 0.3537192106246948, + "step": 820 + }, + { + "epoch": 0.2656, + "grad_norm": 2.556165398739607, + "learning_rate": 4.595966246484986e-06, + "loss": 0.3528641700744629, + "step": 830 + }, + { + "epoch": 0.2688, + "grad_norm": 2.593548528246384, + "learning_rate": 4.580611653949829e-06, + "loss": 0.3564203500747681, + "step": 840 + }, + { + "epoch": 0.272, + "grad_norm": 3.428440109671292, + "learning_rate": 4.564997371177992e-06, + "loss": 0.3518026828765869, + "step": 850 + }, + { + "epoch": 0.2752, + "grad_norm": 4.993564850548027, + "learning_rate": 4.54912534705637e-06, + "loss": 0.35079920291900635, + "step": 860 + }, + { + "epoch": 0.2784, + "grad_norm": 3.340510283095063, + "learning_rate": 4.532997562641683e-06, + "loss": 0.3466078042984009, + "step": 870 + }, + { + "epoch": 0.2816, + "grad_norm": 2.6894615056191644, + "learning_rate": 4.516616030913214e-06, + "loss": 0.3472653865814209, + "step": 880 + }, + { + "epoch": 0.2848, + "grad_norm": 1.891440124594712, + "learning_rate": 4.499982796521556e-06, + "loss": 0.34483723640441893, + "step": 890 + }, + { + "epoch": 0.288, + "grad_norm": 3.223309297530686, + "learning_rate": 4.48309993553341e-06, + "loss": 0.3444544553756714, + "step": 900 + }, + { + "epoch": 0.2912, + "grad_norm": 3.1032077209020468, + "learning_rate": 4.465969555172468e-06, + "loss": 0.34571564197540283, + "step": 910 + }, + { + "epoch": 0.2944, + "grad_norm": 2.5407458837926638, + "learning_rate": 4.448593793556391e-06, + "loss": 0.3534140110015869, + "step": 920 + }, + { + "epoch": 0.2976, + "grad_norm": 3.1253686498979123, + "learning_rate": 4.430974819429954e-06, + "loss": 0.3445676326751709, + "step": 930 + }, + { + "epoch": 0.3008, + "grad_norm": 3.740083740472538, + "learning_rate": 4.413114831894344e-06, + "loss": 0.33962287902832033, + "step": 940 + }, + { + "epoch": 0.304, + "grad_norm": 4.724023923665093, + "learning_rate": 4.3950160601326865e-06, + "loss": 0.3363780498504639, + "step": 950 + }, + { + "epoch": 0.3072, + "grad_norm": 3.597276867142834, + "learning_rate": 4.376680763131811e-06, + "loss": 0.3429840087890625, + "step": 960 + }, + { + "epoch": 0.3104, + "grad_norm": 2.97998267012516, + "learning_rate": 4.358111229400296e-06, + "loss": 0.3470882177352905, + "step": 970 + }, + { + "epoch": 0.3136, + "grad_norm": 3.1405275857331856, + "learning_rate": 4.33930977668283e-06, + "loss": 0.35235731601715087, + "step": 980 + }, + { + "epoch": 0.3168, + "grad_norm": 3.774584318253359, + "learning_rate": 4.320278751670922e-06, + "loss": 0.3418004512786865, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 3.4325438208492605, + "learning_rate": 4.301020529710009e-06, + "loss": 0.3456583499908447, + "step": 1000 + }, + { + "epoch": 0.3232, + "grad_norm": 3.1407187711443916, + "learning_rate": 4.281537514502962e-06, + "loss": 0.3446167469024658, + "step": 1010 + }, + { + "epoch": 0.3264, + "grad_norm": 2.6154317834679226, + "learning_rate": 4.261832137810093e-06, + "loss": 0.34354138374328613, + "step": 1020 + }, + { + "epoch": 0.3296, + "grad_norm": 2.8993376261822648, + "learning_rate": 4.241906859145611e-06, + "loss": 0.3451784372329712, + "step": 1030 + }, + { + "epoch": 0.3328, + "grad_norm": 2.3351853591260574, + "learning_rate": 4.221764165470661e-06, + "loss": 0.33875834941864014, + "step": 1040 + }, + { + "epoch": 0.336, + "grad_norm": 3.4295735539049605, + "learning_rate": 4.201406570882898e-06, + "loss": 0.33980226516723633, + "step": 1050 + }, + { + "epoch": 0.3392, + "grad_norm": 2.6388634367096735, + "learning_rate": 4.180836616302704e-06, + "loss": 0.3395829200744629, + "step": 1060 + }, + { + "epoch": 0.3424, + "grad_norm": 3.211009486395674, + "learning_rate": 4.160056869156041e-06, + "loss": 0.3433471441268921, + "step": 1070 + }, + { + "epoch": 0.3456, + "grad_norm": 3.4377414857289317, + "learning_rate": 4.139069923053995e-06, + "loss": 0.34047765731811525, + "step": 1080 + }, + { + "epoch": 0.3488, + "grad_norm": 3.131466112366247, + "learning_rate": 4.117878397469062e-06, + "loss": 0.3420018434524536, + "step": 1090 + }, + { + "epoch": 0.352, + "grad_norm": 2.388207923072635, + "learning_rate": 4.096484937408195e-06, + "loss": 0.3351470470428467, + "step": 1100 + }, + { + "epoch": 0.3552, + "grad_norm": 2.2910707329028117, + "learning_rate": 4.074892213082676e-06, + "loss": 0.33539299964904784, + "step": 1110 + }, + { + "epoch": 0.3584, + "grad_norm": 2.156244058261874, + "learning_rate": 4.0531029195748265e-06, + "loss": 0.33862009048461916, + "step": 1120 + }, + { + "epoch": 0.3616, + "grad_norm": 2.6382644444406296, + "learning_rate": 4.03111977650163e-06, + "loss": 0.34041495323181153, + "step": 1130 + }, + { + "epoch": 0.3648, + "grad_norm": 2.5960896388831545, + "learning_rate": 4.008945527675281e-06, + "loss": 0.3390871524810791, + "step": 1140 + }, + { + "epoch": 0.368, + "grad_norm": 3.657074741484568, + "learning_rate": 3.986582940760717e-06, + "loss": 0.3278806209564209, + "step": 1150 + }, + { + "epoch": 0.3712, + "grad_norm": 2.9587401358526075, + "learning_rate": 3.9640348069301785e-06, + "loss": 0.3368961334228516, + "step": 1160 + }, + { + "epoch": 0.3744, + "grad_norm": 1.965300565427372, + "learning_rate": 3.941303940514826e-06, + "loss": 0.3339808464050293, + "step": 1170 + }, + { + "epoch": 0.3776, + "grad_norm": 2.90985435283837, + "learning_rate": 3.918393178653472e-06, + "loss": 0.3376065969467163, + "step": 1180 + }, + { + "epoch": 0.3808, + "grad_norm": 3.27190473511409, + "learning_rate": 3.895305380938468e-06, + "loss": 0.3342454433441162, + "step": 1190 + }, + { + "epoch": 0.384, + "grad_norm": 2.0468253424433165, + "learning_rate": 3.872043429058783e-06, + "loss": 0.32965447902679446, + "step": 1200 + }, + { + "epoch": 0.3872, + "grad_norm": 2.5123150680001576, + "learning_rate": 3.84861022644033e-06, + "loss": 0.3357837677001953, + "step": 1210 + }, + { + "epoch": 0.3904, + "grad_norm": 3.148104290988529, + "learning_rate": 3.825008697883574e-06, + "loss": 0.34343953132629396, + "step": 1220 + }, + { + "epoch": 0.3936, + "grad_norm": 2.488823913942074, + "learning_rate": 3.8012417891984776e-06, + "loss": 0.333116340637207, + "step": 1230 + }, + { + "epoch": 0.3968, + "grad_norm": 3.0225259799028645, + "learning_rate": 3.777312466836819e-06, + "loss": 0.3318933486938477, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 3.3439153363899115, + "learning_rate": 3.7532237175219378e-06, + "loss": 0.32833037376403806, + "step": 1250 + }, + { + "epoch": 0.4032, + "grad_norm": 2.72884090647899, + "learning_rate": 3.728978547875948e-06, + "loss": 0.3360243082046509, + "step": 1260 + }, + { + "epoch": 0.4064, + "grad_norm": 2.5999080124511966, + "learning_rate": 3.7045799840444712e-06, + "loss": 0.33025145530700684, + "step": 1270 + }, + { + "epoch": 0.4096, + "grad_norm": 3.0518346526448488, + "learning_rate": 3.6800310713189258e-06, + "loss": 0.3306798219680786, + "step": 1280 + }, + { + "epoch": 0.4128, + "grad_norm": 2.0509087709244507, + "learning_rate": 3.6553348737564328e-06, + "loss": 0.33091559410095217, + "step": 1290 + }, + { + "epoch": 0.416, + "grad_norm": 2.908137390744499, + "learning_rate": 3.6304944737973794e-06, + "loss": 0.33455810546875, + "step": 1300 + }, + { + "epoch": 0.4192, + "grad_norm": 3.0396312942670796, + "learning_rate": 3.6055129718806836e-06, + "loss": 0.331624960899353, + "step": 1310 + }, + { + "epoch": 0.4224, + "grad_norm": 3.282462978283218, + "learning_rate": 3.5803934860568134e-06, + "loss": 0.32364490032196047, + "step": 1320 + }, + { + "epoch": 0.4256, + "grad_norm": 2.2269456751164727, + "learning_rate": 3.5551391515986163e-06, + "loss": 0.3319955348968506, + "step": 1330 + }, + { + "epoch": 0.4288, + "grad_norm": 2.8364899461485527, + "learning_rate": 3.529753120609982e-06, + "loss": 0.3252741813659668, + "step": 1340 + }, + { + "epoch": 0.432, + "grad_norm": 2.89515974439621, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.3287111520767212, + "step": 1350 + }, + { + "epoch": 0.4352, + "grad_norm": 2.311001238312573, + "learning_rate": 3.4785986592495934e-06, + "loss": 0.32939796447753905, + "step": 1360 + }, + { + "epoch": 0.4384, + "grad_norm": 2.4126049139350734, + "learning_rate": 3.452836613689803e-06, + "loss": 0.32168779373168943, + "step": 1370 + }, + { + "epoch": 0.4416, + "grad_norm": 3.1765584413022254, + "learning_rate": 3.426955640426584e-06, + "loss": 0.32864985466003416, + "step": 1380 + }, + { + "epoch": 0.4448, + "grad_norm": 3.154206643410634, + "learning_rate": 3.4009589697773605e-06, + "loss": 0.3260640621185303, + "step": 1390 + }, + { + "epoch": 0.448, + "grad_norm": 3.4230687653412564, + "learning_rate": 3.3748498465002475e-06, + "loss": 0.32304584980010986, + "step": 1400 + }, + { + "epoch": 0.4512, + "grad_norm": 2.6276396964869684, + "learning_rate": 3.3486315293890693e-06, + "loss": 0.33318138122558594, + "step": 1410 + }, + { + "epoch": 0.4544, + "grad_norm": 2.754821177049362, + "learning_rate": 3.3223072908666053e-06, + "loss": 0.32256054878234863, + "step": 1420 + }, + { + "epoch": 0.4576, + "grad_norm": 2.881952130772473, + "learning_rate": 3.295880416576153e-06, + "loss": 0.33387539386749265, + "step": 1430 + }, + { + "epoch": 0.4608, + "grad_norm": 2.5217047707442966, + "learning_rate": 3.269354204971427e-06, + "loss": 0.32321481704711913, + "step": 1440 + }, + { + "epoch": 0.464, + "grad_norm": 2.976679985492794, + "learning_rate": 3.242731966904865e-06, + "loss": 0.32245721817016604, + "step": 1450 + }, + { + "epoch": 0.4672, + "grad_norm": 2.527563459090948, + "learning_rate": 3.2160170252143913e-06, + "loss": 0.32239205837249757, + "step": 1460 + }, + { + "epoch": 0.4704, + "grad_norm": 1.997832889519553, + "learning_rate": 3.1892127143086716e-06, + "loss": 0.32758924961090086, + "step": 1470 + }, + { + "epoch": 0.4736, + "grad_norm": 2.299101703675196, + "learning_rate": 3.1623223797509347e-06, + "loss": 0.31891183853149413, + "step": 1480 + }, + { + "epoch": 0.4768, + "grad_norm": 2.9210746413068907, + "learning_rate": 3.135349377841396e-06, + "loss": 0.32430353164672854, + "step": 1490 + }, + { + "epoch": 0.48, + "grad_norm": 2.6265609696149146, + "learning_rate": 3.1082970751983497e-06, + "loss": 0.3312281608581543, + "step": 1500 + }, + { + "epoch": 0.4832, + "grad_norm": 2.5956160397204786, + "learning_rate": 3.0811688483379546e-06, + "loss": 0.3238035202026367, + "step": 1510 + }, + { + "epoch": 0.4864, + "grad_norm": 2.231793404952503, + "learning_rate": 3.0539680832528074e-06, + "loss": 0.32330875396728515, + "step": 1520 + }, + { + "epoch": 0.4896, + "grad_norm": 2.5723097920479763, + "learning_rate": 3.026698174989316e-06, + "loss": 0.32520170211791993, + "step": 1530 + }, + { + "epoch": 0.4928, + "grad_norm": 2.691498291676849, + "learning_rate": 2.999362527223952e-06, + "loss": 0.3273704290390015, + "step": 1540 + }, + { + "epoch": 0.496, + "grad_norm": 2.0511124933056375, + "learning_rate": 2.9719645518384194e-06, + "loss": 0.3250606536865234, + "step": 1550 + }, + { + "epoch": 0.4992, + "grad_norm": 2.872290392112785, + "learning_rate": 2.944507668493807e-06, + "loss": 0.3281686782836914, + "step": 1560 + }, + { + "epoch": 0.5024, + "grad_norm": 2.330246614888919, + "learning_rate": 2.9169953042037623e-06, + "loss": 0.32374157905578616, + "step": 1570 + }, + { + "epoch": 0.5056, + "grad_norm": 2.0520711406500394, + "learning_rate": 2.889430892906754e-06, + "loss": 0.3169667720794678, + "step": 1580 + }, + { + "epoch": 0.5088, + "grad_norm": 2.048670737699487, + "learning_rate": 2.861817875037462e-06, + "loss": 0.3160442590713501, + "step": 1590 + }, + { + "epoch": 0.512, + "grad_norm": 2.8695840695234303, + "learning_rate": 2.8341596970973683e-06, + "loss": 0.32544608116149903, + "step": 1600 + }, + { + "epoch": 0.5152, + "grad_norm": 1.976397223627746, + "learning_rate": 2.80645981122458e-06, + "loss": 0.3229134798049927, + "step": 1610 + }, + { + "epoch": 0.5184, + "grad_norm": 2.7070609575351807, + "learning_rate": 2.7787216747629508e-06, + "loss": 0.32655487060546873, + "step": 1620 + }, + { + "epoch": 0.5216, + "grad_norm": 2.6027463070090993, + "learning_rate": 2.7509487498305615e-06, + "loss": 0.31430754661560056, + "step": 1630 + }, + { + "epoch": 0.5248, + "grad_norm": 2.4274539931656585, + "learning_rate": 2.7231445028875924e-06, + "loss": 0.3237884759902954, + "step": 1640 + }, + { + "epoch": 0.528, + "grad_norm": 1.9308598632845329, + "learning_rate": 2.6953124043036604e-06, + "loss": 0.32111692428588867, + "step": 1650 + }, + { + "epoch": 0.5312, + "grad_norm": 2.1321964485217784, + "learning_rate": 2.667455927924667e-06, + "loss": 0.3178241729736328, + "step": 1660 + }, + { + "epoch": 0.5344, + "grad_norm": 3.1390388403682534, + "learning_rate": 2.6395785506392164e-06, + "loss": 0.31754770278930666, + "step": 1670 + }, + { + "epoch": 0.5376, + "grad_norm": 2.137535651695072, + "learning_rate": 2.6116837519446407e-06, + "loss": 0.3183767795562744, + "step": 1680 + }, + { + "epoch": 0.5408, + "grad_norm": 2.353751591087722, + "learning_rate": 2.5837750135127192e-06, + "loss": 0.31382954120635986, + "step": 1690 + }, + { + "epoch": 0.544, + "grad_norm": 2.58704039056448, + "learning_rate": 2.555855818755108e-06, + "loss": 0.3226866483688354, + "step": 1700 + }, + { + "epoch": 0.5472, + "grad_norm": 2.709677414439902, + "learning_rate": 2.5279296523885636e-06, + "loss": 0.3166576623916626, + "step": 1710 + }, + { + "epoch": 0.5504, + "grad_norm": 2.0859245317104107, + "learning_rate": 2.5e-06, + "loss": 0.3218212127685547, + "step": 1720 + }, + { + "epoch": 0.5536, + "grad_norm": 2.3347357869338436, + "learning_rate": 2.472070347611437e-06, + "loss": 0.31246294975280764, + "step": 1730 + }, + { + "epoch": 0.5568, + "grad_norm": 2.5799420800617106, + "learning_rate": 2.444144181244893e-06, + "loss": 0.31868853569030764, + "step": 1740 + }, + { + "epoch": 0.56, + "grad_norm": 2.8867509619529406, + "learning_rate": 2.416224986487282e-06, + "loss": 0.31381807327270506, + "step": 1750 + }, + { + "epoch": 0.5632, + "grad_norm": 2.625660671305278, + "learning_rate": 2.3883162480553605e-06, + "loss": 0.31146280765533446, + "step": 1760 + }, + { + "epoch": 0.5664, + "grad_norm": 2.8862495653341544, + "learning_rate": 2.3604214493607844e-06, + "loss": 0.3111546993255615, + "step": 1770 + }, + { + "epoch": 0.5696, + "grad_norm": 2.267020272744141, + "learning_rate": 2.332544072075333e-06, + "loss": 0.32178173065185545, + "step": 1780 + }, + { + "epoch": 0.5728, + "grad_norm": 2.073205643473978, + "learning_rate": 2.30468759569634e-06, + "loss": 0.31751441955566406, + "step": 1790 + }, + { + "epoch": 0.576, + "grad_norm": 2.232045258362397, + "learning_rate": 2.276855497112408e-06, + "loss": 0.3135702610015869, + "step": 1800 + }, + { + "epoch": 0.5792, + "grad_norm": 3.4632505976937744, + "learning_rate": 2.2490512501694394e-06, + "loss": 0.3126095771789551, + "step": 1810 + }, + { + "epoch": 0.5824, + "grad_norm": 2.7008114205550022, + "learning_rate": 2.2212783252370496e-06, + "loss": 0.31725611686706545, + "step": 1820 + }, + { + "epoch": 0.5856, + "grad_norm": 2.640110404643157, + "learning_rate": 2.1935401887754213e-06, + "loss": 0.3210929870605469, + "step": 1830 + }, + { + "epoch": 0.5888, + "grad_norm": 2.9154181525967924, + "learning_rate": 2.165840302902632e-06, + "loss": 0.31817543506622314, + "step": 1840 + }, + { + "epoch": 0.592, + "grad_norm": 2.3435756622683916, + "learning_rate": 2.1381821249625383e-06, + "loss": 0.3186073303222656, + "step": 1850 + }, + { + "epoch": 0.5952, + "grad_norm": 2.391868801860604, + "learning_rate": 2.1105691070932465e-06, + "loss": 0.3081700563430786, + "step": 1860 + }, + { + "epoch": 0.5984, + "grad_norm": 2.27033295147997, + "learning_rate": 2.083004695796238e-06, + "loss": 0.30403599739074705, + "step": 1870 + }, + { + "epoch": 0.6016, + "grad_norm": 2.1095837820360157, + "learning_rate": 2.055492331506194e-06, + "loss": 0.31353535652160647, + "step": 1880 + }, + { + "epoch": 0.6048, + "grad_norm": 2.284519052184323, + "learning_rate": 2.0280354481615814e-06, + "loss": 0.31677517890930174, + "step": 1890 + }, + { + "epoch": 0.608, + "grad_norm": 2.237766836173548, + "learning_rate": 2.000637472776049e-06, + "loss": 0.3152945041656494, + "step": 1900 + }, + { + "epoch": 0.6112, + "grad_norm": 2.7842715157490434, + "learning_rate": 1.973301825010685e-06, + "loss": 0.30818216800689696, + "step": 1910 + }, + { + "epoch": 0.6144, + "grad_norm": 2.4813744091778784, + "learning_rate": 1.9460319167471934e-06, + "loss": 0.31820502281188967, + "step": 1920 + }, + { + "epoch": 0.6176, + "grad_norm": 2.0821248606030887, + "learning_rate": 1.9188311516620466e-06, + "loss": 0.31040709018707274, + "step": 1930 + }, + { + "epoch": 0.6208, + "grad_norm": 2.9336859566866975, + "learning_rate": 1.891702924801651e-06, + "loss": 0.31292426586151123, + "step": 1940 + }, + { + "epoch": 0.624, + "grad_norm": 2.511253012965921, + "learning_rate": 1.864650622158604e-06, + "loss": 0.32196660041809083, + "step": 1950 + }, + { + "epoch": 0.6272, + "grad_norm": 2.4545922236833455, + "learning_rate": 1.8376776202490666e-06, + "loss": 0.31464810371398927, + "step": 1960 + }, + { + "epoch": 0.6304, + "grad_norm": 2.277913414668649, + "learning_rate": 1.8107872856913293e-06, + "loss": 0.30748977661132815, + "step": 1970 + }, + { + "epoch": 0.6336, + "grad_norm": 3.6960663974743273, + "learning_rate": 1.7839829747856096e-06, + "loss": 0.31303911209106444, + "step": 1980 + }, + { + "epoch": 0.6368, + "grad_norm": 2.5169048193896844, + "learning_rate": 1.7572680330951359e-06, + "loss": 0.309541130065918, + "step": 1990 + }, + { + "epoch": 0.64, + "grad_norm": 2.625801312197355, + "learning_rate": 1.7306457950285747e-06, + "loss": 0.31228773593902587, + "step": 2000 + }, + { + "epoch": 0.6432, + "grad_norm": 3.166705714244592, + "learning_rate": 1.704119583423848e-06, + "loss": 0.30709683895111084, + "step": 2010 + }, + { + "epoch": 0.6464, + "grad_norm": 2.7529448920288755, + "learning_rate": 1.677692709133396e-06, + "loss": 0.3121641159057617, + "step": 2020 + }, + { + "epoch": 0.6496, + "grad_norm": 2.4164266641009386, + "learning_rate": 1.6513684706109311e-06, + "loss": 0.31612191200256345, + "step": 2030 + }, + { + "epoch": 0.6528, + "grad_norm": 2.1475852674178486, + "learning_rate": 1.6251501534997529e-06, + "loss": 0.30926761627197263, + "step": 2040 + }, + { + "epoch": 0.656, + "grad_norm": 3.027937409819003, + "learning_rate": 1.5990410302226405e-06, + "loss": 0.3059820652008057, + "step": 2050 + }, + { + "epoch": 0.6592, + "grad_norm": 2.3663528005893575, + "learning_rate": 1.5730443595734162e-06, + "loss": 0.30960190296173096, + "step": 2060 + }, + { + "epoch": 0.6624, + "grad_norm": 2.5495655090650806, + "learning_rate": 1.5471633863101982e-06, + "loss": 0.3146512508392334, + "step": 2070 + }, + { + "epoch": 0.6656, + "grad_norm": 2.563871195645732, + "learning_rate": 1.521401340750407e-06, + "loss": 0.3116560935974121, + "step": 2080 + }, + { + "epoch": 0.6688, + "grad_norm": 2.4316488926893314, + "learning_rate": 1.495761438367577e-06, + "loss": 0.31447796821594237, + "step": 2090 + }, + { + "epoch": 0.672, + "grad_norm": 2.446980089200077, + "learning_rate": 1.4702468793900187e-06, + "loss": 0.3153538703918457, + "step": 2100 + }, + { + "epoch": 0.6752, + "grad_norm": 2.2511595317617283, + "learning_rate": 1.444860848401384e-06, + "loss": 0.31273808479309084, + "step": 2110 + }, + { + "epoch": 0.6784, + "grad_norm": 2.459748219135552, + "learning_rate": 1.4196065139431866e-06, + "loss": 0.31059865951538085, + "step": 2120 + }, + { + "epoch": 0.6816, + "grad_norm": 2.4570005490031805, + "learning_rate": 1.3944870281193178e-06, + "loss": 0.31122384071350095, + "step": 2130 + }, + { + "epoch": 0.6848, + "grad_norm": 2.5940034157380447, + "learning_rate": 1.3695055262026208e-06, + "loss": 0.3145638704299927, + "step": 2140 + }, + { + "epoch": 0.688, + "grad_norm": 2.8940635665298644, + "learning_rate": 1.3446651262435679e-06, + "loss": 0.31133465766906737, + "step": 2150 + }, + { + "epoch": 0.6912, + "grad_norm": 2.2603444512196216, + "learning_rate": 1.3199689286810746e-06, + "loss": 0.31110968589782717, + "step": 2160 + }, + { + "epoch": 0.6944, + "grad_norm": 2.3697248986342223, + "learning_rate": 1.2954200159555294e-06, + "loss": 0.3046250820159912, + "step": 2170 + }, + { + "epoch": 0.6976, + "grad_norm": 2.9149559965372083, + "learning_rate": 1.2710214521240527e-06, + "loss": 0.3056375503540039, + "step": 2180 + }, + { + "epoch": 0.7008, + "grad_norm": 2.785583016537511, + "learning_rate": 1.246776282478063e-06, + "loss": 0.3074607849121094, + "step": 2190 + }, + { + "epoch": 0.704, + "grad_norm": 2.238483419316128, + "learning_rate": 1.222687533163181e-06, + "loss": 0.30821986198425294, + "step": 2200 + }, + { + "epoch": 0.7072, + "grad_norm": 2.0963873111402225, + "learning_rate": 1.1987582108015228e-06, + "loss": 0.31098227500915526, + "step": 2210 + }, + { + "epoch": 0.7104, + "grad_norm": 2.3511311934322725, + "learning_rate": 1.1749913021164255e-06, + "loss": 0.3125911712646484, + "step": 2220 + }, + { + "epoch": 0.7136, + "grad_norm": 2.0182013166602735, + "learning_rate": 1.1513897735596702e-06, + "loss": 0.30506420135498047, + "step": 2230 + }, + { + "epoch": 0.7168, + "grad_norm": 2.0904990978865654, + "learning_rate": 1.127956570941218e-06, + "loss": 0.30170474052429197, + "step": 2240 + }, + { + "epoch": 0.72, + "grad_norm": 2.3591898483151525, + "learning_rate": 1.104694619061533e-06, + "loss": 0.3140627145767212, + "step": 2250 + }, + { + "epoch": 0.7232, + "grad_norm": 2.3874798738589553, + "learning_rate": 1.0816068213465295e-06, + "loss": 0.3148207187652588, + "step": 2260 + }, + { + "epoch": 0.7264, + "grad_norm": 2.462173136321867, + "learning_rate": 1.0586960594851762e-06, + "loss": 0.30828402042388914, + "step": 2270 + }, + { + "epoch": 0.7296, + "grad_norm": 2.2877287929832946, + "learning_rate": 1.0359651930698217e-06, + "loss": 0.30725433826446535, + "step": 2280 + }, + { + "epoch": 0.7328, + "grad_norm": 2.5585705908550413, + "learning_rate": 1.0134170592392837e-06, + "loss": 0.30991530418395996, + "step": 2290 + }, + { + "epoch": 0.736, + "grad_norm": 2.415441399008779, + "learning_rate": 9.910544723247204e-07, + "loss": 0.31087689399719237, + "step": 2300 + }, + { + "epoch": 0.7392, + "grad_norm": 2.6450690086623285, + "learning_rate": 9.688802234983706e-07, + "loss": 0.3067446231842041, + "step": 2310 + }, + { + "epoch": 0.7424, + "grad_norm": 2.363123649822279, + "learning_rate": 9.468970804251742e-07, + "loss": 0.30767192840576174, + "step": 2320 + }, + { + "epoch": 0.7456, + "grad_norm": 2.245412676348008, + "learning_rate": 9.251077869173244e-07, + "loss": 0.30107917785644533, + "step": 2330 + }, + { + "epoch": 0.7488, + "grad_norm": 2.5736642361970503, + "learning_rate": 9.035150625918054e-07, + "loss": 0.303986120223999, + "step": 2340 + }, + { + "epoch": 0.752, + "grad_norm": 2.6844109007429138, + "learning_rate": 8.821216025309395e-07, + "loss": 0.3074802875518799, + "step": 2350 + }, + { + "epoch": 0.7552, + "grad_norm": 2.412670568786912, + "learning_rate": 8.609300769460055e-07, + "loss": 0.30130510330200194, + "step": 2360 + }, + { + "epoch": 0.7584, + "grad_norm": 3.176069141824472, + "learning_rate": 8.399431308439592e-07, + "loss": 0.3105806827545166, + "step": 2370 + }, + { + "epoch": 0.7616, + "grad_norm": 2.23339472526297, + "learning_rate": 8.191633836972962e-07, + "loss": 0.3084972620010376, + "step": 2380 + }, + { + "epoch": 0.7648, + "grad_norm": 2.6912839020724175, + "learning_rate": 7.985934291171024e-07, + "loss": 0.3067460536956787, + "step": 2390 + }, + { + "epoch": 0.768, + "grad_norm": 2.5426618104677976, + "learning_rate": 7.7823583452934e-07, + "loss": 0.30809898376464845, + "step": 2400 + }, + { + "epoch": 0.7712, + "grad_norm": 2.55531536817282, + "learning_rate": 7.58093140854389e-07, + "loss": 0.3071744441986084, + "step": 2410 + }, + { + "epoch": 0.7744, + "grad_norm": 2.285863017236424, + "learning_rate": 7.381678621899077e-07, + "loss": 0.3093477725982666, + "step": 2420 + }, + { + "epoch": 0.7776, + "grad_norm": 2.3600405361881767, + "learning_rate": 7.184624854970379e-07, + "loss": 0.30798888206481934, + "step": 2430 + }, + { + "epoch": 0.7808, + "grad_norm": 2.0247328579355726, + "learning_rate": 6.989794702899932e-07, + "loss": 0.3048464298248291, + "step": 2440 + }, + { + "epoch": 0.784, + "grad_norm": 2.7079172300622334, + "learning_rate": 6.797212483290777e-07, + "loss": 0.3093360424041748, + "step": 2450 + }, + { + "epoch": 0.7872, + "grad_norm": 2.8011999237207967, + "learning_rate": 6.60690223317171e-07, + "loss": 0.30233092308044435, + "step": 2460 + }, + { + "epoch": 0.7904, + "grad_norm": 2.202966089641912, + "learning_rate": 6.418887705997046e-07, + "loss": 0.3048731327056885, + "step": 2470 + }, + { + "epoch": 0.7936, + "grad_norm": 2.6510546903467755, + "learning_rate": 6.23319236868189e-07, + "loss": 0.3104764461517334, + "step": 2480 + }, + { + "epoch": 0.7968, + "grad_norm": 2.510992490322273, + "learning_rate": 6.049839398673141e-07, + "loss": 0.31223044395446775, + "step": 2490 + }, + { + "epoch": 0.8, + "grad_norm": 2.7988283248607604, + "learning_rate": 5.868851681056567e-07, + "loss": 0.3109541893005371, + "step": 2500 + }, + { + "epoch": 0.8032, + "grad_norm": 2.370572243788772, + "learning_rate": 5.690251805700467e-07, + "loss": 0.3075347900390625, + "step": 2510 + }, + { + "epoch": 0.8064, + "grad_norm": 2.057318428676814, + "learning_rate": 5.514062064436096e-07, + "loss": 0.30944228172302246, + "step": 2520 + }, + { + "epoch": 0.8096, + "grad_norm": 2.9526395601791937, + "learning_rate": 5.34030444827533e-07, + "loss": 0.30773684978485105, + "step": 2530 + }, + { + "epoch": 0.8128, + "grad_norm": 2.1808951881567165, + "learning_rate": 5.169000644665895e-07, + "loss": 0.30281686782836914, + "step": 2540 + }, + { + "epoch": 0.816, + "grad_norm": 2.501184820191482, + "learning_rate": 5.000172034784442e-07, + "loss": 0.30731327533721925, + "step": 2550 + }, + { + "epoch": 0.8192, + "grad_norm": 2.4433836822113304, + "learning_rate": 4.833839690867853e-07, + "loss": 0.30861892700195315, + "step": 2560 + }, + { + "epoch": 0.8224, + "grad_norm": 2.482955525732734, + "learning_rate": 4.6700243735831705e-07, + "loss": 0.3014340400695801, + "step": 2570 + }, + { + "epoch": 0.8256, + "grad_norm": 2.516375989369738, + "learning_rate": 4.508746529436311e-07, + "loss": 0.302032995223999, + "step": 2580 + }, + { + "epoch": 0.8288, + "grad_norm": 2.2676227598264926, + "learning_rate": 4.350026288220083e-07, + "loss": 0.30550131797790525, + "step": 2590 + }, + { + "epoch": 0.832, + "grad_norm": 2.3829531066293126, + "learning_rate": 4.1938834605017133e-07, + "loss": 0.3046237945556641, + "step": 2600 + }, + { + "epoch": 0.8352, + "grad_norm": 2.0018887466739548, + "learning_rate": 4.0403375351501515e-07, + "loss": 0.3024258852005005, + "step": 2610 + }, + { + "epoch": 0.8384, + "grad_norm": 2.5182571334882597, + "learning_rate": 3.88940767690362e-07, + "loss": 0.3063870906829834, + "step": 2620 + }, + { + "epoch": 0.8416, + "grad_norm": 2.7441991027074355, + "learning_rate": 3.7411127239775774e-07, + "loss": 0.30306272506713866, + "step": 2630 + }, + { + "epoch": 0.8448, + "grad_norm": 2.161963722714269, + "learning_rate": 3.595471185713431e-07, + "loss": 0.3009947299957275, + "step": 2640 + }, + { + "epoch": 0.848, + "grad_norm": 2.7694143698141285, + "learning_rate": 3.4525012402682826e-07, + "loss": 0.30188300609588625, + "step": 2650 + }, + { + "epoch": 0.8512, + "grad_norm": 2.6814413975784217, + "learning_rate": 3.3122207323460804e-07, + "loss": 0.3024703025817871, + "step": 2660 + }, + { + "epoch": 0.8544, + "grad_norm": 2.4444711671869306, + "learning_rate": 3.1746471709702963e-07, + "loss": 0.3008608102798462, + "step": 2670 + }, + { + "epoch": 0.8576, + "grad_norm": 2.6886622183433015, + "learning_rate": 3.039797727298585e-07, + "loss": 0.30821614265441893, + "step": 2680 + }, + { + "epoch": 0.8608, + "grad_norm": 2.641784614909192, + "learning_rate": 2.9076892324795546e-07, + "loss": 0.30515303611755373, + "step": 2690 + }, + { + "epoch": 0.864, + "grad_norm": 2.5595370943122444, + "learning_rate": 2.778338175551995e-07, + "loss": 0.3007267236709595, + "step": 2700 + }, + { + "epoch": 0.8672, + "grad_norm": 2.283872628964803, + "learning_rate": 2.6517607013868326e-07, + "loss": 0.30617167949676516, + "step": 2710 + }, + { + "epoch": 0.8704, + "grad_norm": 2.558413840419693, + "learning_rate": 2.527972608672002e-07, + "loss": 0.3038905143737793, + "step": 2720 + }, + { + "epoch": 0.8736, + "grad_norm": 2.4952676522317567, + "learning_rate": 2.40698934794053e-07, + "loss": 0.3054081201553345, + "step": 2730 + }, + { + "epoch": 0.8768, + "grad_norm": 2.247637838190116, + "learning_rate": 2.2888260196421237e-07, + "loss": 0.3028261661529541, + "step": 2740 + }, + { + "epoch": 0.88, + "grad_norm": 2.5035963414447804, + "learning_rate": 2.1734973722583735e-07, + "loss": 0.3062435626983643, + "step": 2750 + }, + { + "epoch": 0.8832, + "grad_norm": 1.918923632238423, + "learning_rate": 2.0610178004619564e-07, + "loss": 0.2972743034362793, + "step": 2760 + }, + { + "epoch": 0.8864, + "grad_norm": 2.4603002546330845, + "learning_rate": 1.9514013433199834e-07, + "loss": 0.3119321346282959, + "step": 2770 + }, + { + "epoch": 0.8896, + "grad_norm": 2.1315709346733667, + "learning_rate": 1.8446616825416958e-07, + "loss": 0.30900893211364744, + "step": 2780 + }, + { + "epoch": 0.8928, + "grad_norm": 2.3753122188061218, + "learning_rate": 1.7408121407708007e-07, + "loss": 0.3069151401519775, + "step": 2790 + }, + { + "epoch": 0.896, + "grad_norm": 2.207415755325001, + "learning_rate": 1.6398656799226253e-07, + "loss": 0.2986165523529053, + "step": 2800 + }, + { + "epoch": 0.8992, + "grad_norm": 2.178561452169741, + "learning_rate": 1.5418348995662773e-07, + "loss": 0.3010268688201904, + "step": 2810 + }, + { + "epoch": 0.9024, + "grad_norm": 2.5082064593439393, + "learning_rate": 1.4467320353520275e-07, + "loss": 0.2984073877334595, + "step": 2820 + }, + { + "epoch": 0.9056, + "grad_norm": 2.366814729694057, + "learning_rate": 1.3545689574841341e-07, + "loss": 0.3026757001876831, + "step": 2830 + }, + { + "epoch": 0.9088, + "grad_norm": 2.380709246306716, + "learning_rate": 1.26535716923927e-07, + "loss": 0.310437536239624, + "step": 2840 + }, + { + "epoch": 0.912, + "grad_norm": 2.484246324702375, + "learning_rate": 1.1791078055307493e-07, + "loss": 0.30369887351989744, + "step": 2850 + }, + { + "epoch": 0.9152, + "grad_norm": 2.6412244000001786, + "learning_rate": 1.0958316315187289e-07, + "loss": 0.3044759750366211, + "step": 2860 + }, + { + "epoch": 0.9184, + "grad_norm": 2.4542916560781967, + "learning_rate": 1.0155390412665528e-07, + "loss": 0.30136928558349607, + "step": 2870 + }, + { + "epoch": 0.9216, + "grad_norm": 2.631911471911446, + "learning_rate": 9.38240056443443e-08, + "loss": 0.30144243240356444, + "step": 2880 + }, + { + "epoch": 0.9248, + "grad_norm": 2.2530200189747243, + "learning_rate": 8.639443250736402e-08, + "loss": 0.3027902603149414, + "step": 2890 + }, + { + "epoch": 0.928, + "grad_norm": 3.1331936934174123, + "learning_rate": 7.926611203321777e-08, + "loss": 0.30441856384277344, + "step": 2900 + }, + { + "epoch": 0.9312, + "grad_norm": 2.5134219010551067, + "learning_rate": 7.243993393874882e-08, + "loss": 0.306389307975769, + "step": 2910 + }, + { + "epoch": 0.9344, + "grad_norm": 2.372785201514508, + "learning_rate": 6.591675022908805e-08, + "loss": 0.30292179584503176, + "step": 2920 + }, + { + "epoch": 0.9376, + "grad_norm": 2.407913531878434, + "learning_rate": 5.969737509131241e-08, + "loss": 0.29895825386047364, + "step": 2930 + }, + { + "epoch": 0.9408, + "grad_norm": 2.2376435379528865, + "learning_rate": 5.3782584792823334e-08, + "loss": 0.30271134376525877, + "step": 2940 + }, + { + "epoch": 0.944, + "grad_norm": 2.653290725438786, + "learning_rate": 4.817311758445686e-08, + "loss": 0.3062829732894897, + "step": 2950 + }, + { + "epoch": 0.9472, + "grad_norm": 2.42511171945876, + "learning_rate": 4.286967360833866e-08, + "loss": 0.3066932439804077, + "step": 2960 + }, + { + "epoch": 0.9504, + "grad_norm": 2.1534299736877895, + "learning_rate": 3.787291481049754e-08, + "loss": 0.3068870544433594, + "step": 2970 + }, + { + "epoch": 0.9536, + "grad_norm": 2.209956884835794, + "learning_rate": 3.3183464858244364e-08, + "loss": 0.31453580856323243, + "step": 2980 + }, + { + "epoch": 0.9568, + "grad_norm": 2.5928568899987017, + "learning_rate": 2.8801909062328992e-08, + "loss": 0.2991969108581543, + "step": 2990 + }, + { + "epoch": 0.96, + "grad_norm": 2.385980918167846, + "learning_rate": 2.4728794303886248e-08, + "loss": 0.2963397026062012, + "step": 3000 + }, + { + "epoch": 0.9632, + "grad_norm": 2.374100986684654, + "learning_rate": 2.0964628966175794e-08, + "loss": 0.30301966667175295, + "step": 3010 + }, + { + "epoch": 0.9664, + "grad_norm": 2.094256605734986, + "learning_rate": 1.750988287113009e-08, + "loss": 0.2994666576385498, + "step": 3020 + }, + { + "epoch": 0.9696, + "grad_norm": 1.916185239441286, + "learning_rate": 1.4364987220713278e-08, + "loss": 0.3080729007720947, + "step": 3030 + }, + { + "epoch": 0.9728, + "grad_norm": 2.3446521041543207, + "learning_rate": 1.1530334543099763e-08, + "loss": 0.3026130199432373, + "step": 3040 + }, + { + "epoch": 0.976, + "grad_norm": 2.5854178252734323, + "learning_rate": 9.006278643683697e-09, + "loss": 0.309655499458313, + "step": 3050 + }, + { + "epoch": 0.9792, + "grad_norm": 1.9908162772434517, + "learning_rate": 6.793134560916514e-09, + "loss": 0.31186389923095703, + "step": 3060 + }, + { + "epoch": 0.9824, + "grad_norm": 2.1977962094508534, + "learning_rate": 4.891178526986451e-09, + "loss": 0.30645883083343506, + "step": 3070 + }, + { + "epoch": 0.9856, + "grad_norm": 2.2397406638818147, + "learning_rate": 3.3006479333413943e-09, + "loss": 0.3090504169464111, + "step": 3080 + }, + { + "epoch": 0.9888, + "grad_norm": 2.0435901319475036, + "learning_rate": 2.021741301058422e-09, + "loss": 0.3049570322036743, + "step": 3090 + }, + { + "epoch": 0.992, + "grad_norm": 2.371036869409615, + "learning_rate": 1.0546182560652872e-09, + "loss": 0.3073274612426758, + "step": 3100 + }, + { + "epoch": 0.9952, + "grad_norm": 2.2551729202130457, + "learning_rate": 3.9939950921774607e-10, + "loss": 0.30047030448913575, + "step": 3110 + }, + { + "epoch": 0.9984, + "grad_norm": 2.2067081414460827, + "learning_rate": 5.616684123160854e-11, + "loss": 0.3023503065109253, + "step": 3120 + }, + { + "epoch": 1.0, + "step": 3125, + "total_flos": 1.0913057758773248e+16, + "train_loss": 0.7292402684783935, + "train_runtime": 30167.0559, + "train_samples_per_second": 6.63, + "train_steps_per_second": 0.104 + } + ], + "logging_steps": 10, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0913057758773248e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png b/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..f78c1b6ee186ebfafd3c8e04e74c3f1677868e33 Binary files /dev/null and b/checkpoints/Gemma-4-E4B-it-SFT/training_loss.png differ diff --git a/checkpoints/InternVL3.5-8B-SFT/all_results.json b/checkpoints/InternVL3.5-8B-SFT/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4436aa71bbc2b5c4eb3af92edaf52a6a1e07b35c --- /dev/null +++ b/checkpoints/InternVL3.5-8B-SFT/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "total_flos": 1955525886476288.0, + "train_loss": 0.1948647116279602, + "train_runtime": 28413.61, + "train_samples_per_second": 7.039, + "train_steps_per_second": 0.11 +} \ No newline at end of file diff --git a/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja b/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..2e5dcf6ad9edfd8c89a52da19c23bd8a8d87a7f2 --- /dev/null +++ b/checkpoints/InternVL3.5-8B-SFT/chat_template.jinja @@ -0,0 +1,6 @@ +{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ ' +' }}{% elif content['type'] == 'video' %}{{ '