diff --git "a/debug.log" "b/debug.log" --- "a/debug.log" +++ "b/debug.log" @@ -1563,3 +1563,404 @@ Parameter Offload - Persistent parameters statistics: param_count = 65, numel = 24%|██████████████████████████▋ | 600/2499 [1:12:50<3:18:31, 6.27s/it] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.14it/s] [2025-12-28 12:18:30,249] [INFO] [axolotl.core.trainers.base._save:692] [PID:42410] Saving model checkpoint to ./outputs/luau-codellama-h200/checkpoint-600 + 24%|██████████████████████████▍ | 601/2499 [1:13:01<17:40:22, 33.52s/it] {'loss': 0.5431, 'grad_norm': 0.15443255007266998, 'learning_rate': 0.00017352963349382875, 'ppl': 1.7213, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.05, 'tokens_per_second_per_gpu': 4740.78, 'total_tokens': 26107242, 'epoch': 0.72} + 24%|██████████████████████████▍ | 601/2499 [1:13:01<17:40:22, 33.52s/it] 24%|██████████████████████████▍ | 602/2499 [1:13:07<13:21:04, 25.34s/it] {'loss': 0.5608, 'grad_norm': 0.15965475142002106, 'learning_rate': 0.00017344403050746084, 'ppl': 1.7521, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.05, 'tokens_per_second_per_gpu': 4474.57, 'total_tokens': 26135160, 'epoch': 0.72} + 24%|██████████████████████████▍ | 602/2499 [1:13:07<13:21:04, 25.34s/it] 24%|██████████████████████████▌ | 603/2499 [1:13:13<10:19:58, 19.62s/it] {'loss': 0.5405, 'grad_norm': 0.15331172943115234, 'learning_rate': 0.00017335831051556064, 'ppl': 1.7169, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4560.89, 'total_tokens': 26163788, 'epoch': 0.72} + 24%|██████████████████████████▌ | 603/2499 [1:13:13<10:19:58, 19.62s/it] 24%|██████████████████████████▊ | 604/2499 [1:13:20<8:13:08, 15.61s/it] {'loss': 0.554, 'grad_norm': 0.15818923711776733, 'learning_rate': 0.00017327247365469078, 'ppl': 1.7402, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4679.66, 'total_tokens': 26193104, 'epoch': 0.73} + 24%|██████████████████████████▊ | 604/2499 [1:13:20<8:13:08, 15.61s/it] 24%|██████████████████████████▊ | 605/2499 [1:13:26<6:44:32, 12.82s/it] {'loss': 0.5676, 'grad_norm': 0.1669849008321762, 'learning_rate': 0.0001731865200616001, 'ppl': 1.764, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4402.46, 'total_tokens': 26220768, 'epoch': 0.73} + 24%|██████████████████████████▊ | 605/2499 [1:13:26<6:44:32, 12.82s/it] 24%|██████████████████████████▉ | 606/2499 [1:13:32<5:42:05, 10.84s/it] {'loss': 0.5567, 'grad_norm': 0.1447984129190445, 'learning_rate': 0.00017310044987322348, 'ppl': 1.7449, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4433.92, 'total_tokens': 26248418, 'epoch': 0.73} + 24%|██████████████████████████▉ | 606/2499 [1:13:32<5:42:05, 10.84s/it] 24%|██████████████████████████▉ | 607/2499 [1:13:38<4:58:29, 9.47s/it] {'loss': 0.6084, 'grad_norm': 0.17044642567634583, 'learning_rate': 0.00017301426322668143, 'ppl': 1.8375, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4454.73, 'total_tokens': 26276263, 'epoch': 0.73} + 24%|██████████████████████████▉ | 607/2499 [1:13:38<4:58:29, 9.47s/it] 24%|███████████████████████████ | 608/2499 [1:13:45<4:28:09, 8.51s/it] {'loss': 0.5733, 'grad_norm': 0.1568867266178131, 'learning_rate': 0.00017292796025928012, 'ppl': 1.7741, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4658.56, 'total_tokens': 26305482, 'epoch': 0.73} + 24%|███████████████████████████ | 608/2499 [1:13:45<4:28:09, 8.51s/it] 24%|███████████████████████████ | 609/2499 [1:13:51<4:06:48, 7.84s/it] {'loss': 0.586, 'grad_norm': 0.155142679810524, 'learning_rate': 0.000172841541108511, 'ppl': 1.7968, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4641.87, 'total_tokens': 26334550, 'epoch': 0.73} + 24%|███████████████████████████ | 609/2499 [1:13:51<4:06:48, 7.84s/it] 24%|███████████████████████████ | 610/2499 [1:13:57<3:51:45, 7.36s/it] {'loss': 0.5379, 'grad_norm': 0.164072185754776, 'learning_rate': 0.00017275500591205055, 'ppl': 1.7124, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4624.75, 'total_tokens': 26363473, 'epoch': 0.73} + 24%|███████████████████████████ | 610/2499 [1:13:57<3:51:45, 7.36s/it] 24%|███████████████████████████▏ | 611/2499 [1:14:04<3:41:31, 7.04s/it] {'loss': 0.5643, 'grad_norm': 0.14827710390090942, 'learning_rate': 0.00017266835480776014, 'ppl': 1.7582, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4589.99, 'total_tokens': 26392329, 'epoch': 0.73} + 24%|███████████████████████████▏ | 611/2499 [1:14:04<3:41:31, 7.04s/it] 24%|███████████████████████████▏ | 612/2499 [1:14:10<3:33:57, 6.80s/it] {'loss': 0.5551, 'grad_norm': 0.14518024027347565, 'learning_rate': 0.0001725815879336859, 'ppl': 1.7421, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4454.43, 'total_tokens': 26420158, 'epoch': 0.73} + 24%|███████████████████████████▏ | 612/2499 [1:14:10<3:33:57, 6.80s/it] 25%|███████████████████████████▏ | 613/2499 [1:14:16<3:28:27, 6.63s/it] {'loss': 0.5249, 'grad_norm': 0.14937075972557068, 'learning_rate': 0.00017249470542805826, 'ppl': 1.6903, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4379.43, 'total_tokens': 26447441, 'epoch': 0.74} + 25%|███████████████████████████▏ | 613/2499 [1:14:16<3:28:27, 6.63s/it] 25%|███████████████████████████▎ | 614/2499 [1:14:22<3:24:56, 6.52s/it] {'loss': 0.5378, 'grad_norm': 0.1505361795425415, 'learning_rate': 0.00017240770742929192, 'ppl': 1.7122, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.07, 'tokens_per_second_per_gpu': 4620.94, 'total_tokens': 26476403, 'epoch': 0.74} + 25%|███████████████████████████▎ | 614/2499 [1:14:22<3:24:56, 6.52s/it] 25%|███████████████████████████▎ | 615/2499 [1:14:29<3:22:14, 6.44s/it] {'loss': 0.557, 'grad_norm': 0.15796837210655212, 'learning_rate': 0.00017232059407598565, 'ppl': 1.7454, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4509.85, 'total_tokens': 26504572, 'epoch': 0.74} + 25%|███████████████████████████▎ | 615/2499 [1:14:29<3:22:14, 6.44s/it] 25%|███████████████████████████▎ | 616/2499 [1:14:35<3:20:16, 6.38s/it] {'loss': 0.5542, 'grad_norm': 0.22552503645420074, 'learning_rate': 0.00017223336550692186, 'ppl': 1.7405, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4382.77, 'total_tokens': 26531925, 'epoch': 0.74} + 25%|███████████████████████████▎ | 616/2499 [1:14:35<3:20:16, 6.38s/it] 25%|███████████████████████████▍ | 617/2499 [1:14:41<3:18:54, 6.34s/it] {'loss': 0.5768, 'grad_norm': 0.15645365417003632, 'learning_rate': 0.00017214602186106662, 'ppl': 1.7803, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4453.04, 'total_tokens': 26559739, 'epoch': 0.74} + 25%|███████████████████████████▍ | 617/2499 [1:14:41<3:18:54, 6.34s/it] 25%|███████████████████████████▍ | 618/2499 [1:14:47<3:18:18, 6.33s/it] {'loss': 0.5381, 'grad_norm': 0.14894016087055206, 'learning_rate': 0.00017205856327756925, 'ppl': 1.7127, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4469.12, 'total_tokens': 26587835, 'epoch': 0.74} + 25%|███████████████████████████▍ | 618/2499 [1:14:47<3:18:18, 6.33s/it] 25%|███████████████████████████▍ | 619/2499 [1:14:54<3:17:25, 6.30s/it] {'loss': 0.5574, 'grad_norm': 0.16387908160686493, 'learning_rate': 0.00017197098989576222, 'ppl': 1.7461, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4446.37, 'total_tokens': 26615575, 'epoch': 0.74} + 25%|███████████████████████████▍ | 619/2499 [1:14:54<3:17:25, 6.30s/it] 25%|███████████████████████████▌ | 620/2499 [1:15:00<3:16:50, 6.29s/it] {'loss': 0.5345, 'grad_norm': 0.1502378284931183, 'learning_rate': 0.00017188330185516094, 'ppl': 1.7066, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4413.85, 'total_tokens': 26643152, 'epoch': 0.74} + 25%|███████████████████████████▌ | 620/2499 [1:15:00<3:16:50, 6.29s/it] 25%|███████████████████████████▌ | 621/2499 [1:15:06<3:16:25, 6.28s/it] {'loss': 0.5992, 'grad_norm': 0.1670679748058319, 'learning_rate': 0.00017179549929546335, 'ppl': 1.8207, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4508.66, 'total_tokens': 26671327, 'epoch': 0.75} + 25%|███████████████████████████▌ | 621/2499 [1:15:06<3:16:25, 6.28s/it] 25%|███████████████████████████▋ | 622/2499 [1:15:12<3:16:10, 6.27s/it] {'loss': 0.5946, 'grad_norm': 0.16181008517742157, 'learning_rate': 0.00017170758235654997, 'ppl': 1.8123, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4577.69, 'total_tokens': 26699973, 'epoch': 0.75} + 25%|███████████████████████████▋ | 622/2499 [1:15:12<3:16:10, 6.27s/it] 25%|███████████████████████████▋ | 623/2499 [1:15:19<3:16:02, 6.27s/it] {'loss': 0.5877, 'grad_norm': 0.17699268460273743, 'learning_rate': 0.0001716195511784835, 'ppl': 1.7998, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4582.31, 'total_tokens': 26728687, 'epoch': 0.75} + 25%|███████████████████████████▋ | 623/2499 [1:15:19<3:16:02, 6.27s/it] 25%|███████████████████████████▋ | 624/2499 [1:15:25<3:16:15, 6.28s/it] {'loss': 0.546, 'grad_norm': 0.16804426908493042, 'learning_rate': 0.0001715314059015086, 'ppl': 1.7263, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4576.77, 'total_tokens': 26757523, 'epoch': 0.75} + 25%|███████████████████████████▋ | 624/2499 [1:15:25<3:16:15, 6.28s/it] 25%|███████████████████████████▊ | 625/2499 [1:15:31<3:16:01, 6.28s/it] {'loss': 0.5654, 'grad_norm': 0.1552819013595581, 'learning_rate': 0.00017144314666605172, 'ppl': 1.7602, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4491.13, 'total_tokens': 26785650, 'epoch': 0.75} + 25%|███████████████████████████▊ | 625/2499 [1:15:31<3:16:01, 6.28s/it] 25%|███████████████████████████▊ | 626/2499 [1:15:37<3:15:33, 6.26s/it] {'loss': 0.5238, 'grad_norm': 0.16035959124565125, 'learning_rate': 0.0001713547736127209, 'ppl': 1.6884, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4372.25, 'total_tokens': 26812912, 'epoch': 0.75} + 25%|███████████████████████████▊ | 626/2499 [1:15:37<3:15:33, 6.26s/it] 25%|███████████████████████████▊ | 627/2499 [1:15:44<3:15:27, 6.26s/it] {'loss': 0.5031, 'grad_norm': 0.15231968462467194, 'learning_rate': 0.00017126628688230545, 'ppl': 1.6538, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4624.25, 'total_tokens': 26841872, 'epoch': 0.75} + 25%|███████████████████████████▊ | 627/2499 [1:15:44<3:15:27, 6.26s/it] 25%|███████████████████████████▉ | 628/2499 [1:15:50<3:15:22, 6.27s/it] {'loss': 0.5672, 'grad_norm': 0.1670321822166443, 'learning_rate': 0.0001711776866157758, 'ppl': 1.7633, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4552.49, 'total_tokens': 26870393, 'epoch': 0.75} + 25%|███████████████████████████▉ | 628/2499 [1:15:50<3:15:22, 6.27s/it] 25%|███████████████████████████▉ | 629/2499 [1:15:56<3:15:00, 6.26s/it] {'loss': 0.5665, 'grad_norm': 0.16463960707187653, 'learning_rate': 0.00017108897295428326, 'ppl': 1.7621, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4309.71, 'total_tokens': 26897258, 'epoch': 0.76} + 25%|███████████████████████████▉ | 629/2499 [1:15:56<3:15:00, 6.26s/it] 25%|███████████████████████████▉ | 630/2499 [1:16:02<3:15:03, 6.26s/it] {'loss': 0.5288, 'grad_norm': 0.15193606913089752, 'learning_rate': 0.0001710001460391598, 'ppl': 1.6969, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4405.36, 'total_tokens': 26924890, 'epoch': 0.76} + 25%|███████████████████████████▉ | 630/2499 [1:16:02<3:15:03, 6.26s/it] 25%|████████████████████████████ | 631/2499 [1:16:09<3:14:50, 6.26s/it] {'loss': 0.5362, 'grad_norm': 0.16677305102348328, 'learning_rate': 0.00017091120601191786, 'ppl': 1.7095, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4439.72, 'total_tokens': 26952621, 'epoch': 0.76} + 25%|████████████████████████████ | 631/2499 [1:16:09<3:14:50, 6.26s/it] 25%|████████████████████████████ | 632/2499 [1:16:15<3:14:45, 6.26s/it] {'loss': 0.5676, 'grad_norm': 0.17070624232292175, 'learning_rate': 0.00017082215301424998, 'ppl': 1.764, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4575.59, 'total_tokens': 26981256, 'epoch': 0.76} + 25%|████████████████████████████ | 632/2499 [1:16:15<3:14:45, 6.26s/it] 25%|████████████████████████████ | 633/2499 [1:16:21<3:14:32, 6.26s/it] {'loss': 0.5367, 'grad_norm': 0.15856873989105225, 'learning_rate': 0.00017073298718802871, 'ppl': 1.7104, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4297.55, 'total_tokens': 27008087, 'epoch': 0.76} + 25%|████████████████████████████ | 633/2499 [1:16:21<3:14:32, 6.26s/it] 25%|████████████████████████████▏ | 634/2499 [1:16:27<3:14:24, 6.25s/it] {'loss': 0.5403, 'grad_norm': 0.15367814898490906, 'learning_rate': 0.00017064370867530645, 'ppl': 1.7165, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4376.93, 'total_tokens': 27035441, 'epoch': 0.76} + 25%|████████████████████████████▏ | 634/2499 [1:16:27<3:14:24, 6.25s/it] 25%|████████████████████████████▏ | 635/2499 [1:16:34<3:14:11, 6.25s/it] {'loss': 0.6084, 'grad_norm': 0.1730221062898636, 'learning_rate': 0.00017055431761831498, 'ppl': 1.8375, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4387.66, 'total_tokens': 27062820, 'epoch': 0.76} + 25%|████████████████████████████▏ | 635/2499 [1:16:34<3:14:11, 6.25s/it] 25%|████████████████████████████▏ | 636/2499 [1:16:40<3:14:27, 6.26s/it] {'loss': 0.5498, 'grad_norm': 0.165005624294281, 'learning_rate': 0.00017046481415946549, 'ppl': 1.7329, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4582.61, 'total_tokens': 27091639, 'epoch': 0.76} + 25%|████████████████████████████▏ | 636/2499 [1:16:40<3:14:27, 6.26s/it] 25%|████████████████████████████▎ | 637/2499 [1:16:46<3:14:39, 6.27s/it] {'loss': 0.5798, 'grad_norm': 0.16355308890342712, 'learning_rate': 0.00017037519844134813, 'ppl': 1.7857, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4530.74, 'total_tokens': 27120147, 'epoch': 0.76} + 25%|████████████████████████████▎ | 637/2499 [1:16:46<3:14:39, 6.27s/it] 26%|████████████████████████████▎ | 638/2499 [1:16:53<3:14:26, 6.27s/it] {'loss': 0.6201, 'grad_norm': 0.15821842849254608, 'learning_rate': 0.000170285470606732, 'ppl': 1.8591, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4515.18, 'total_tokens': 27148403, 'epoch': 0.77} + 26%|████████████████████████████▎ | 638/2499 [1:16:53<3:14:26, 6.27s/it] 26%|████████████████████████████▍ | 639/2499 [1:16:59<3:14:06, 6.26s/it] {'loss': 0.5461, 'grad_norm': 0.15199202299118042, 'learning_rate': 0.00017019563079856474, 'ppl': 1.7265, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4300.16, 'total_tokens': 27175240, 'epoch': 0.77} + 26%|████████████████████████████▍ | 639/2499 [1:16:59<3:14:06, 6.26s/it] 26%|████████████████████████████▍ | 640/2499 [1:17:05<3:13:55, 6.26s/it] {'loss': 0.5573, 'grad_norm': 0.15636083483695984, 'learning_rate': 0.00017010567915997244, 'ppl': 1.746, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4436.65, 'total_tokens': 27202969, 'epoch': 0.77} + 26%|████████████████████████████▍ | 640/2499 [1:17:05<3:13:55, 6.26s/it] 26%|████████████████████████████▍ | 641/2499 [1:17:11<3:13:42, 6.26s/it] {'loss': 0.5442, 'grad_norm': 0.14868567883968353, 'learning_rate': 0.00017001561583425932, 'ppl': 1.7232, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4426.99, 'total_tokens': 27230617, 'epoch': 0.77} + 26%|████████████████████████████▍ | 641/2499 [1:17:11<3:13:42, 6.26s/it] 26%|████████████████████████████▌ | 642/2499 [1:17:18<3:13:37, 6.26s/it] {'loss': 0.5611, 'grad_norm': 0.1626492142677307, 'learning_rate': 0.0001699254409649075, 'ppl': 1.7526, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4496.31, 'total_tokens': 27258739, 'epoch': 0.77} + 26%|████████████████████████████▌ | 642/2499 [1:17:18<3:13:37, 6.26s/it] 26%|████████████████████████████▌ | 643/2499 [1:17:24<3:13:48, 6.27s/it] {'loss': 0.5445, 'grad_norm': 0.15078237652778625, 'learning_rate': 0.00016983515469557684, 'ppl': 1.7237, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4534.17, 'total_tokens': 27287235, 'epoch': 0.77} + 26%|████████████████████████████▌ | 643/2499 [1:17:24<3:13:48, 6.27s/it] 26%|████████████████████████████▌ | 644/2499 [1:17:30<3:13:41, 6.26s/it] {'loss': 0.5867, 'grad_norm': 0.1609424501657486, 'learning_rate': 0.00016974475717010468, 'ppl': 1.798, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4581.26, 'total_tokens': 27315915, 'epoch': 0.77} + 26%|████████████████████████████▌ | 644/2499 [1:17:30<3:13:41, 6.26s/it] 26%|████████████████████████████▋ | 645/2499 [1:17:36<3:13:27, 6.26s/it] {'loss': 0.5694, 'grad_norm': 0.1608027219772339, 'learning_rate': 0.00016965424853250557, 'ppl': 1.7672, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4355.39, 'total_tokens': 27343127, 'epoch': 0.77} + 26%|████████████████████████████▋ | 645/2499 [1:17:36<3:13:27, 6.26s/it] 26%|████████████████████████████▋ | 646/2499 [1:17:43<3:13:27, 6.26s/it] {'loss': 0.5648, 'grad_norm': 0.15912018716335297, 'learning_rate': 0.00016956362892697112, 'ppl': 1.7591, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4732.01, 'total_tokens': 27372800, 'epoch': 0.78} + 26%|████████████████████████████▋ | 646/2499 [1:17:43<3:13:27, 6.26s/it] 26%|████████████████████████████▋ | 647/2499 [1:17:49<3:13:21, 6.26s/it] {'loss': 0.5446, 'grad_norm': 0.16331568360328674, 'learning_rate': 0.00016947289849786974, 'ppl': 1.7239, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4598.22, 'total_tokens': 27401590, 'epoch': 0.78} + 26%|████████████████████████████▋ | 647/2499 [1:17:49<3:13:21, 6.26s/it] 26%|████████████████████████████▊ | 648/2499 [1:17:55<3:13:14, 6.26s/it] {'loss': 0.5874, 'grad_norm': 0.16428092122077942, 'learning_rate': 0.00016938205738974626, 'ppl': 1.7993, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4608.8, 'total_tokens': 27430448, 'epoch': 0.78} + 26%|████████████████████████████▊ | 648/2499 [1:17:55<3:13:14, 6.26s/it] 26%|████████████████████████████▊ | 649/2499 [1:18:01<3:13:11, 6.27s/it] {'loss': 0.5681, 'grad_norm': 0.1655767410993576, 'learning_rate': 0.00016929110574732202, 'ppl': 1.7649, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4383.76, 'total_tokens': 27457921, 'epoch': 0.78} + 26%|████████████████████████████▊ | 649/2499 [1:18:01<3:13:11, 6.27s/it] 26%|████████████████████████████▊ | 650/2499 [1:18:08<3:13:14, 6.27s/it] {'loss': 0.5865, 'grad_norm': 0.15909960865974426, 'learning_rate': 0.0001692000437154943, 'ppl': 1.7977, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4610.29, 'total_tokens': 27486869, 'epoch': 0.78} + 26%|████████████████████████████▊ | 650/2499 [1:18:08<3:13:14, 6.27s/it] 26%|████████████████████████████▉ | 651/2499 [1:18:14<3:13:06, 6.27s/it] {'loss': 0.5777, 'grad_norm': 0.1619177907705307, 'learning_rate': 0.00016910887143933636, 'ppl': 1.7819, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4469.57, 'total_tokens': 27514869, 'epoch': 0.78} + 26%|████████████████████████████▉ | 651/2499 [1:18:14<3:13:06, 6.27s/it] 26%|████████████████████████████▉ | 652/2499 [1:18:20<3:12:57, 6.27s/it] {'loss': 0.5718, 'grad_norm': 0.1830313801765442, 'learning_rate': 0.00016901758906409705, 'ppl': 1.7715, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4500.34, 'total_tokens': 27543048, 'epoch': 0.78} + 26%|████████████████████████████▉ | 652/2499 [1:18:20<3:12:57, 6.27s/it] 26%|█████████████████████████████ | 653/2499 [1:18:26<3:13:02, 6.27s/it] {'loss': 0.551, 'grad_norm': 0.16313879191875458, 'learning_rate': 0.00016892619673520057, 'ppl': 1.735, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4727.61, 'total_tokens': 27572770, 'epoch': 0.78} + 26%|█████████████████████████████ | 653/2499 [1:18:26<3:13:02, 6.27s/it] 26%|█████████████████████████████ | 654/2499 [1:18:33<3:12:58, 6.28s/it] {'loss': 0.5615, 'grad_norm': 0.15075667202472687, 'learning_rate': 0.00016883469459824644, 'ppl': 1.7533, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4654.0, 'total_tokens': 27601981, 'epoch': 0.79} + 26%|█████████████████████████████ | 654/2499 [1:18:33<3:12:58, 6.28s/it] 26%|█████████████████████████████ | 655/2499 [1:18:39<3:13:06, 6.28s/it] {'loss': 0.5551, 'grad_norm': 0.15292450785636902, 'learning_rate': 0.0001687430827990089, 'ppl': 1.7421, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4633.85, 'total_tokens': 27631169, 'epoch': 0.79} + 26%|█████████████████████████████ | 655/2499 [1:18:39<3:13:06, 6.28s/it] 26%|█████████████████████████████▏ | 656/2499 [1:18:45<3:12:57, 6.28s/it] {'loss': 0.6184, 'grad_norm': 0.1681700199842453, 'learning_rate': 0.00016865136148343706, 'ppl': 1.856, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4542.76, 'total_tokens': 27659667, 'epoch': 0.79} + 26%|█████████████████████████████▏ | 656/2499 [1:18:45<3:12:57, 6.28s/it] 26%|█████████████████████████████▏ | 657/2499 [1:18:52<3:12:37, 6.27s/it] {'loss': 0.5921, 'grad_norm': 0.16229918599128723, 'learning_rate': 0.00016855953079765448, 'ppl': 1.8078, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4427.98, 'total_tokens': 27687361, 'epoch': 0.79} + 26%|█████████████████████████████▏ | 657/2499 [1:18:52<3:12:37, 6.27s/it] 26%|█████████████████████████████▏ | 658/2499 [1:18:58<3:12:26, 6.27s/it] {'loss': 0.5735, 'grad_norm': 0.15574562549591064, 'learning_rate': 0.0001684675908879589, 'ppl': 1.7745, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4419.82, 'total_tokens': 27715048, 'epoch': 0.79} + 26%|█████████████████████████████▏ | 658/2499 [1:18:58<3:12:26, 6.27s/it] 26%|█████████████████████████████▎ | 659/2499 [1:19:04<3:12:24, 6.27s/it] {'loss': 0.5574, 'grad_norm': 0.16547827422618866, 'learning_rate': 0.00016837554190082208, 'ppl': 1.7461, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4549.83, 'total_tokens': 27743605, 'epoch': 0.79} + 26%|█████████████████████████████▎ | 659/2499 [1:19:04<3:12:24, 6.27s/it] 26%|█████████████████████████████▎ | 660/2499 [1:19:10<3:12:16, 6.27s/it] {'loss': 0.5474, 'grad_norm': 0.15572473406791687, 'learning_rate': 0.00016828338398288965, 'ppl': 1.7288, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4563.37, 'total_tokens': 27772212, 'epoch': 0.79} + 26%|█████████████████████████████▎ | 660/2499 [1:19:10<3:12:16, 6.27s/it] 26%|█████████████████████████████▎ | 661/2499 [1:19:17<3:11:56, 6.27s/it] {'loss': 0.5487, 'grad_norm': 0.2158125638961792, 'learning_rate': 0.00016819111728098065, 'ppl': 1.731, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4361.97, 'total_tokens': 27799457, 'epoch': 0.79} + 26%|█████████████████████████████▎ | 661/2499 [1:19:17<3:11:56, 6.27s/it] 26%|█████████████████████████████▍ | 662/2499 [1:19:23<3:11:52, 6.27s/it] {'loss': 0.534, 'grad_norm': 0.15643706917762756, 'learning_rate': 0.0001680987419420875, 'ppl': 1.7057, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4173.77, 'total_tokens': 27825617, 'epoch': 0.79} + 26%|█████████████████████████████▍ | 662/2499 [1:19:23<3:11:52, 6.27s/it] 27%|█████████████████████████████▍ | 663/2499 [1:19:29<3:11:57, 6.27s/it] {'loss': 0.5736, 'grad_norm': 0.16257119178771973, 'learning_rate': 0.0001680062581133757, 'ppl': 1.7746, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4626.03, 'total_tokens': 27854685, 'epoch': 0.8} + 27%|█████████████████████████████▍ | 663/2499 [1:19:29<3:11:57, 6.27s/it] 27%|█████████████████████████████▍ | 664/2499 [1:19:35<3:11:49, 6.27s/it] {'loss': 0.5828, 'grad_norm': 0.1598690301179886, 'learning_rate': 0.0001679136659421835, 'ppl': 1.791, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4542.55, 'total_tokens': 27883155, 'epoch': 0.8} + 27%|█████████████████████████████▍ | 664/2499 [1:19:35<3:11:49, 6.27s/it] 27%|█████████████████████████████▌ | 665/2499 [1:19:42<3:11:28, 6.26s/it] {'loss': 0.5444, 'grad_norm': 0.15730322897434235, 'learning_rate': 0.0001678209655760219, 'ppl': 1.7236, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4327.13, 'total_tokens': 27910167, 'epoch': 0.8} + 27%|█████████████████████████████▌ | 665/2499 [1:19:42<3:11:28, 6.26s/it] 27%|█████████████████████████████▌ | 666/2499 [1:19:48<3:11:26, 6.27s/it] {'loss': 0.5888, 'grad_norm': 0.1599961817264557, 'learning_rate': 0.00016772815716257412, 'ppl': 1.8018, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4567.61, 'total_tokens': 27938805, 'epoch': 0.8} + 27%|█████████████████████████████▌ | 666/2499 [1:19:48<3:11:26, 6.27s/it] 27%|█████████████████████████████▋ | 667/2499 [1:19:54<3:11:08, 6.26s/it] {'loss': 0.5983, 'grad_norm': 0.16296197474002838, 'learning_rate': 0.0001676352408496956, 'ppl': 1.819, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4387.08, 'total_tokens': 27966196, 'epoch': 0.8} + 27%|█████████████████████████████▋ | 667/2499 [1:19:54<3:11:08, 6.26s/it] 27%|█████████████████████████████▋ | 668/2499 [1:20:01<3:11:24, 6.27s/it] {'loss': 0.5828, 'grad_norm': 0.16709184646606445, 'learning_rate': 0.00016754221678541367, 'ppl': 1.791, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4387.36, 'total_tokens': 27993823, 'epoch': 0.8} + 27%|█████████████████████████████▋ | 668/2499 [1:20:01<3:11:24, 6.27s/it] 27%|█████████████████████████████▋ | 669/2499 [1:20:07<3:11:20, 6.27s/it] {'loss': 0.5743, 'grad_norm': 0.1495347023010254, 'learning_rate': 0.00016744908511792726, 'ppl': 1.7759, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4491.57, 'total_tokens': 28021994, 'epoch': 0.8} + 27%|█████████████████████████████▋ | 669/2499 [1:20:07<3:11:20, 6.27s/it] 27%|█████████████████████████████▊ | 670/2499 [1:20:13<3:11:09, 6.27s/it] {'loss': 0.5361, 'grad_norm': 0.16424506902694702, 'learning_rate': 0.00016735584599560682, 'ppl': 1.7093, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4433.77, 'total_tokens': 28049760, 'epoch': 0.8} + 27%|█████████████████████████████▊ | 670/2499 [1:20:13<3:11:09, 6.27s/it] 27%|█████████████████████████████▊ | 671/2499 [1:20:19<3:10:58, 6.27s/it] {'loss': 0.5669, 'grad_norm': 0.15702269971370697, 'learning_rate': 0.00016726249956699395, 'ppl': 1.7628, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4422.43, 'total_tokens': 28077446, 'epoch': 0.81} + 27%|█████████████████████████████▊ | 671/2499 [1:20:19<3:10:58, 6.27s/it] 27%|█████████████████████████████▊ | 672/2499 [1:20:26<3:10:48, 6.27s/it] {'loss': 0.5313, 'grad_norm': 0.14038637280464172, 'learning_rate': 0.00016716904598080111, 'ppl': 1.7011, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4663.47, 'total_tokens': 28106634, 'epoch': 0.81} + 27%|█████████████████████████████▊ | 672/2499 [1:20:26<3:10:48, 6.27s/it] 27%|█████████████████████████████▉ | 673/2499 [1:20:32<3:10:38, 6.26s/it] {'loss': 0.5654, 'grad_norm': 0.15101970732212067, 'learning_rate': 0.00016707548538591168, 'ppl': 1.7602, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4560.22, 'total_tokens': 28135170, 'epoch': 0.81} + 27%|█████████████████████████████▉ | 673/2499 [1:20:32<3:10:38, 6.26s/it] 27%|█████████████████████████████▉ | 674/2499 [1:20:38<3:10:42, 6.27s/it] {'loss': 0.5556, 'grad_norm': 0.15208765864372253, 'learning_rate': 0.0001669818179313793, 'ppl': 1.743, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.13, 'tokens_per_second_per_gpu': 4615.18, 'total_tokens': 28164153, 'epoch': 0.81} + 27%|█████████████████████████████▉ | 674/2499 [1:20:38<3:10:42, 6.27s/it] 27%|█████████████████████████████▉ | 675/2499 [1:20:44<3:10:47, 6.28s/it] {'loss': 0.5555, 'grad_norm': 0.1590418815612793, 'learning_rate': 0.000166888043766428, 'ppl': 1.7428, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4408.51, 'total_tokens': 28191869, 'epoch': 0.81} + 27%|█████████████████████████████▉ | 675/2499 [1:20:44<3:10:47, 6.28s/it] 27%|██████████████████████████████ | 676/2499 [1:20:51<3:10:43, 6.28s/it] {'loss': 0.6053, 'grad_norm': 0.1561027318239212, 'learning_rate': 0.0001667941630404517, 'ppl': 1.8318, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4736.35, 'total_tokens': 28221602, 'epoch': 0.81} + 27%|██████████████████████████████ | 676/2499 [1:20:51<3:10:43, 6.28s/it] 27%|██████████████████████████████ | 677/2499 [1:20:57<3:10:28, 6.27s/it] {'loss': 0.5262, 'grad_norm': 0.15915250778198242, 'learning_rate': 0.00016670017590301423, 'ppl': 1.6925, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4611.16, 'total_tokens': 28250459, 'epoch': 0.81} + 27%|██████████████████████████████ | 677/2499 [1:20:57<3:10:28, 6.27s/it] 27%|██████████████████████████████ | 678/2499 [1:21:03<3:10:14, 6.27s/it] {'loss': 0.5602, 'grad_norm': 0.14580583572387695, 'learning_rate': 0.0001666060825038488, 'ppl': 1.751, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4565.83, 'total_tokens': 28279025, 'epoch': 0.81} + 27%|██████████████████████████████ | 678/2499 [1:21:03<3:10:14, 6.27s/it] 27%|██████████████████████████████▏ | 679/2499 [1:21:10<3:10:01, 6.26s/it] {'loss': 0.5398, 'grad_norm': 0.14442190527915955, 'learning_rate': 0.00016651188299285802, 'ppl': 1.7157, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4448.55, 'total_tokens': 28306844, 'epoch': 0.82} + 27%|██████████████████████████████▏ | 679/2499 [1:21:10<3:10:01, 6.26s/it] 27%|██████████████████████████████▏ | 680/2499 [1:21:16<3:09:49, 6.26s/it] {'loss': 0.5656, 'grad_norm': 0.1592138707637787, 'learning_rate': 0.00016641757752011344, 'ppl': 1.7605, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4508.01, 'total_tokens': 28335030, 'epoch': 0.82} + 27%|██████████████████████████████▏ | 680/2499 [1:21:16<3:09:49, 6.26s/it] 27%|██████████████████████████████▏ | 681/2499 [1:21:22<3:09:48, 6.26s/it] {'loss': 0.5587, 'grad_norm': 0.15959708392620087, 'learning_rate': 0.00016632316623585553, 'ppl': 1.7484, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4301.24, 'total_tokens': 28361993, 'epoch': 0.82} + 27%|██████████████████████████████▏ | 681/2499 [1:21:22<3:09:48, 6.26s/it] 27%|██████████████████████████████▎ | 682/2499 [1:21:28<3:09:39, 6.26s/it] {'loss': 0.5892, 'grad_norm': 0.15612153708934784, 'learning_rate': 0.0001662286492904933, 'ppl': 1.8025, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4510.28, 'total_tokens': 28390207, 'epoch': 0.82} + 27%|██████████████████████████████▎ | 682/2499 [1:21:28<3:09:39, 6.26s/it] 27%|██████████████████████████████▎ | 683/2499 [1:21:35<3:09:24, 6.26s/it] {'loss': 0.554, 'grad_norm': 0.14454488456249237, 'learning_rate': 0.00016613402683460398, 'ppl': 1.7402, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4489.04, 'total_tokens': 28418241, 'epoch': 0.82} + 27%|██████████████████████████████▎ | 683/2499 [1:21:35<3:09:24, 6.26s/it] 27%|██████████████████████████████▍ | 684/2499 [1:21:41<3:09:24, 6.26s/it] {'loss': 0.5568, 'grad_norm': 0.16239210963249207, 'learning_rate': 0.00016603929901893305, 'ppl': 1.7451, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4551.45, 'total_tokens': 28446759, 'epoch': 0.82} + 27%|██████████████████████████████▍ | 684/2499 [1:21:41<3:09:24, 6.26s/it] 27%|██████████████████████████████▍ | 685/2499 [1:21:47<3:09:15, 6.26s/it] {'loss': 0.5374, 'grad_norm': 0.15430286526679993, 'learning_rate': 0.0001659444659943938, 'ppl': 1.7116, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4503.42, 'total_tokens': 28474926, 'epoch': 0.82} + 27%|██████████████████████████████▍ | 685/2499 [1:21:47<3:09:15, 6.26s/it] 27%|██████████████████████████████▍ | 686/2499 [1:21:53<3:09:13, 6.26s/it] {'loss': 0.6042, 'grad_norm': 0.15854589641094208, 'learning_rate': 0.00016584952791206704, 'ppl': 1.8298, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4610.25, 'total_tokens': 28503808, 'epoch': 0.82} + 27%|██████████████████████████████▍ | 686/2499 [1:21:53<3:09:13, 6.26s/it] 27%|████���█████████████████████████▌ | 687/2499 [1:22:00<3:09:18, 6.27s/it] {'loss': 0.5621, 'grad_norm': 0.1579902619123459, 'learning_rate': 0.0001657544849232011, 'ppl': 1.7544, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4556.28, 'total_tokens': 28532424, 'epoch': 0.82} + 27%|██████████████████████████████▌ | 687/2499 [1:22:00<3:09:18, 6.27s/it] 28%|██████████████████████████████▌ | 688/2499 [1:22:06<3:09:14, 6.27s/it] {'loss': 0.5661, 'grad_norm': 0.14277489483356476, 'learning_rate': 0.00016565933717921128, 'ppl': 1.7614, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4680.69, 'total_tokens': 28561775, 'epoch': 0.83} + 28%|██████████████████████████████▌ | 688/2499 [1:22:06<3:09:14, 6.27s/it] 28%|██████████████████████████████▌ | 689/2499 [1:22:12<3:08:58, 6.26s/it] {'loss': 0.5377, 'grad_norm': 0.14523279666900635, 'learning_rate': 0.00016556408483167986, 'ppl': 1.7121, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4541.27, 'total_tokens': 28590156, 'epoch': 0.83} + 28%|██████████████████████████████▌ | 689/2499 [1:22:12<3:08:58, 6.26s/it] 28%|██████████████████████████████▋ | 690/2499 [1:22:18<3:08:46, 6.26s/it] {'loss': 0.5416, 'grad_norm': 0.16199174523353577, 'learning_rate': 0.00016546872803235578, 'ppl': 1.7188, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4446.12, 'total_tokens': 28617950, 'epoch': 0.83} + 28%|██████████████████████████████▋ | 690/2499 [1:22:18<3:08:46, 6.26s/it] 28%|██████████████████████████████▋ | 691/2499 [1:22:25<3:08:50, 6.27s/it] {'loss': 0.5335, 'grad_norm': 0.15208809077739716, 'learning_rate': 0.0001653732669331543, 'ppl': 1.7049, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4633.04, 'total_tokens': 28647033, 'epoch': 0.83} + 28%|██████████████████████████████▋ | 691/2499 [1:22:25<3:08:50, 6.27s/it] 28%|██████████████████████████████▋ | 692/2499 [1:22:31<3:08:50, 6.27s/it] {'loss': 0.5125, 'grad_norm': 0.15709447860717773, 'learning_rate': 0.00016527770168615698, 'ppl': 1.6695, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4624.44, 'total_tokens': 28676054, 'epoch': 0.83} + 28%|████��█████████████████████████▋ | 692/2499 [1:22:31<3:08:50, 6.27s/it] 28%|██████████████████████████████▊ | 693/2499 [1:22:37<3:09:02, 6.28s/it] {'loss': 0.4875, 'grad_norm': 0.1479036509990692, 'learning_rate': 0.00016518203244361116, 'ppl': 1.6282, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4542.67, 'total_tokens': 28704686, 'epoch': 0.83} + 28%|██████████████████████████████▊ | 693/2499 [1:22:37<3:09:02, 6.28s/it] 28%|██████████████████████████████▊ | 694/2499 [1:22:44<3:08:55, 6.28s/it] {'loss': 0.6043, 'grad_norm': 0.172959104180336, 'learning_rate': 0.00016508625935792996, 'ppl': 1.83, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4336.96, 'total_tokens': 28731899, 'epoch': 0.83} + 28%|██████████████████████████████▊ | 694/2499 [1:22:44<3:08:55, 6.28s/it] 28%|██████████████████████████████▊ | 695/2499 [1:22:50<3:08:44, 6.28s/it] {'loss': 0.5326, 'grad_norm': 0.154370978474617, 'learning_rate': 0.0001649903825816918, 'ppl': 1.7034, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4565.64, 'total_tokens': 28760516, 'epoch': 0.83} + 28%|██████████████████████████████▊ | 695/2499 [1:22:50<3:08:44, 6.28s/it] 28%|██████████████████████████████▉ | 696/2499 [1:22:56<3:08:24, 6.27s/it] {'loss': 0.5108, 'grad_norm': 0.157211035490036, 'learning_rate': 0.00016489440226764051, 'ppl': 1.6666, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4367.51, 'total_tokens': 28787806, 'epoch': 0.84} + 28%|██████████████████████████████▉ | 696/2499 [1:22:56<3:08:24, 6.27s/it] 28%|██████████████████████████████▉ | 697/2499 [1:23:02<3:08:08, 6.26s/it] {'loss': 0.5681, 'grad_norm': 0.1627659946680069, 'learning_rate': 0.0001647983185686847, 'ppl': 1.7649, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4431.29, 'total_tokens': 28815504, 'epoch': 0.84} + 28%|██████████████████████████████▉ | 697/2499 [1:23:02<3:08:08, 6.26s/it] 28%|███████████████████████████████ | 698/2499 [1:23:09<3:08:02, 6.26s/it] {'loss': 0.5997, 'grad_norm': 0.1593720018863678, 'learning_rate': 0.00016470213163789765, 'ppl': 1.8216, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4492.52, 'total_tokens': 28843640, 'epoch': 0.84} + 28%|███████████████████████████████ | 698/2499 [1:23:09<3:08:02, 6.26s/it] 28%|███████████████████████████████ | 699/2499 [1:23:15<3:07:55, 6.26s/it] {'loss': 0.6004, 'grad_norm': 0.1652214229106903, 'learning_rate': 0.00016460584162851727, 'ppl': 1.8228, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4455.31, 'total_tokens': 28871533, 'epoch': 0.84} + 28%|███████████████████████████████ | 699/2499 [1:23:15<3:07:55, 6.26s/it] 28%|███████████████████████████████ | 700/2499 [1:23:21<3:08:09, 6.28s/it] {'loss': 0.6045, 'grad_norm': 0.4151879847049713, 'learning_rate': 0.00016450944869394554, 'ppl': 1.8303, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.19, 'tokens_per_second_per_gpu': 4499.12, 'total_tokens': 28899871, 'epoch': 0.84} + 28%|███████████████████████████████ | 700/2499 [1:23:21<3:08:09, 6.28s/it][2025-12-28 12:28:57,781] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step... +[2025-12-28 12:29:00,510] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 1.3375587463378906 +[2025-12-28 12:29:01,570] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 1.0599887371063232 +[2025-12-28 12:29:02,589] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 1.0184319019317627 +[2025-12-28 12:29:03,500] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.9104523658752441 +[2025-12-28 12:29:03,500] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90] + + 0%| | 0/90 [00:00