diff --git "a/debug.log" "b/debug.log" --- "a/debug.log" +++ "b/debug.log" @@ -1162,3 +1162,404 @@ Parameter Offload - Persistent parameters statistics: param_count = 65, numel = 16%|██████████████████ | 400/2499 [48:59<3:39:25, 6.27s/it] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:17<00:00, 1.15it/s] [2025-12-28 11:54:39,612] [INFO] [axolotl.core.trainers.base._save:692] [PID:42410] Saving model checkpoint to ./outputs/luau-codellama-h200/checkpoint-400 + 16%|█████████████████▉ | 401/2499 [49:10<19:00:44, 32.62s/it] {'loss': 0.6189, 'grad_norm': 0.16039417684078217, 'learning_rate': 0.0001881269651023858, 'ppl': 1.8569, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.01, 'tokens_per_second_per_gpu': 4650.7, 'total_tokens': 17862360, 'epoch': 0.48} + 16%|█████████████████▉ | 401/2499 [49:10<19:00:44, 32.62s/it] 16%|██████████████████ | 402/2499 [49:16<14:23:47, 24.72s/it] {'loss': 0.5702, 'grad_norm': 0.15665364265441895, 'learning_rate': 0.0001880672420454887, 'ppl': 1.7686, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.03, 'tokens_per_second_per_gpu': 4577.65, 'total_tokens': 17891015, 'epoch': 0.48} + 16%|██████████████████ | 402/2499 [49:16<14:23:47, 24.72s/it] 16%|██████████████████ | 403/2499 [49:23<11:10:17, 19.19s/it] {'loss': 0.5665, 'grad_norm': 0.15146903693675995, 'learning_rate': 0.00018800737868645312, 'ppl': 1.7621, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.03, 'tokens_per_second_per_gpu': 4635.76, 'total_tokens': 17920165, 'epoch': 0.48} + 16%|██████████████████ | 403/2499 [49:23<11:10:17, 19.19s/it] 16%|██████████████████▎ | 404/2499 [49:29<8:54:38, 15.31s/it] {'loss': 0.6125, 'grad_norm': 0.1600656360387802, 'learning_rate': 0.0001879473751206489, 'ppl': 1.845, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4530.87, 'total_tokens': 17948550, 'epoch': 0.48} + 16%|██████████████████▎ | 404/2499 [49:29<8:54:38, 15.31s/it] 16%|██████████████████▎ | 405/2499 [49:35<7:19:31, 12.59s/it] {'loss': 0.5991, 'grad_norm': 0.17024600505828857, 'learning_rate': 0.00018788723144366927, 'ppl': 1.8205, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4537.43, 'total_tokens': 17976903, 'epoch': 0.49} + 16%|██████████████████▎ | 405/2499 [49:35<7:19:31, 12.59s/it] 16%|██████████████████▎ | 406/2499 [49:41<6:13:03, 10.69s/it] {'loss': 0.5744, 'grad_norm': 0.17297804355621338, 'learning_rate': 0.00018782694775133058, 'ppl': 1.7761, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4652.34, 'total_tokens': 18006026, 'epoch': 0.49} + 16%|██████████████████▎ | 406/2499 [49:41<6:13:03, 10.69s/it] 16%|██████████████████▍ | 407/2499 [49:48<5:26:30, 9.36s/it] {'loss': 0.5972, 'grad_norm': 0.17486798763275146, 'learning_rate': 0.00018776652413967236, 'ppl': 1.817, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4501.88, 'total_tokens': 18034203, 'epoch': 0.49} + 16%|██████████████████▍ | 407/2499 [49:48<5:26:30, 9.36s/it] 16%|██████████████████▍ | 408/2499 [49:54<4:53:56, 8.43s/it] {'loss': 0.4842, 'grad_norm': 0.14348573982715607, 'learning_rate': 0.0001877059607049569, 'ppl': 1.6229, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4636.87, 'total_tokens': 18063240, 'epoch': 0.49} + 16%|██████████████████▍ | 408/2499 [49:54<4:53:56, 8.43s/it] 16%|██████████████████▍ | 409/2499 [50:00<4:31:29, 7.79s/it] {'loss': 0.5476, 'grad_norm': 0.14626961946487427, 'learning_rate': 0.00018764525754366937, 'ppl': 1.7291, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4564.52, 'total_tokens': 18091984, 'epoch': 0.49} + 16%|██████████████████▍ | 409/2499 [50:00<4:31:29, 7.79s/it] 16%|██████████████████▌ | 410/2499 [50:07<4:15:24, 7.34s/it] {'loss': 0.5608, 'grad_norm': 0.18620796501636505, 'learning_rate': 0.00018758441475251754, 'ppl': 1.7521, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4481.57, 'total_tokens': 18120048, 'epoch': 0.49} + 16%|██████████████████▌ | 410/2499 [50:07<4:15:24, 7.34s/it] 16%|██████████████████▌ | 411/2499 [50:13<4:04:07, 7.02s/it] {'loss': 0.5314, 'grad_norm': 0.15043221414089203, 'learning_rate': 0.00018752343242843154, 'ppl': 1.7013, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4554.39, 'total_tokens': 18148581, 'epoch': 0.49} + 16%|██████████████████▌ | 411/2499 [50:13<4:04:07, 7.02s/it] 16%|██████████████████▋ | 412/2499 [50:19<3:56:08, 6.79s/it] {'loss': 0.6299, 'grad_norm': 0.15692859888076782, 'learning_rate': 0.00018746231066856387, 'ppl': 1.8774, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4626.86, 'total_tokens': 18177538, 'epoch': 0.49} + 16%|██████████████████▋ | 412/2499 [50:19<3:56:08, 6.79s/it] 17%|██████████████████▋ | 413/2499 [50:25<3:50:31, 6.63s/it] {'loss': 0.6237, 'grad_norm': 0.15896819531917572, 'learning_rate': 0.00018740104957028913, 'ppl': 1.8658, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4577.2, 'total_tokens': 18206187, 'epoch': 0.5} + 17%|██████████████████▋ | 413/2499 [50:25<3:50:31, 6.63s/it] 17%|██████████████████▋ | 414/2499 [50:32<3:46:30, 6.52s/it] {'loss': 0.65, 'grad_norm': 0.18454909324645996, 'learning_rate': 0.00018733964923120392, 'ppl': 1.9155, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4508.93, 'total_tokens': 18234384, 'epoch': 0.5} + 17%|██████████████████▋ | 414/2499 [50:32<3:46:30, 6.52s/it] 17%|██████████████████▊ | 415/2499 [50:38<3:44:02, 6.45s/it] {'loss': 0.5645, 'grad_norm': 0.15952667593955994, 'learning_rate': 0.0001872781097491267, 'ppl': 1.7586, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4557.17, 'total_tokens': 18263045, 'epoch': 0.5} + 17%|██████████████████▊ | 415/2499 [50:38<3:44:02, 6.45s/it] 17%|██████████████████▊ | 416/2499 [50:44<3:41:57, 6.39s/it] {'loss': 0.567, 'grad_norm': 0.15676908195018768, 'learning_rate': 0.0001872164312220975, 'ppl': 1.763, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4339.73, 'total_tokens': 18290199, 'epoch': 0.5} + 17%|██████████████████▊ | 416/2499 [50:44<3:41:57, 6.39s/it] 17%|██████████████████▊ | 417/2499 [50:50<3:40:27, 6.35s/it] {'loss': 0.5373, 'grad_norm': 0.16627971827983856, 'learning_rate': 0.000187154613748378, 'ppl': 1.7114, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4461.35, 'total_tokens': 18318111, 'epoch': 0.5} + 17%|██████████████████▊ | 417/2499 [50:50<3:40:27, 6.35s/it] 17%|██████████████████▉ | 418/2499 [50:57<3:39:31, 6.33s/it] {'loss': 0.5771, 'grad_norm': 0.15211078524589539, 'learning_rate': 0.0001870926574264511, 'ppl': 1.7809, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4570.21, 'total_tokens': 18346773, 'epoch': 0.5} + 17%|██████████████████▉ | 418/2499 [50:57<3:39:31, 6.33s/it] 17%|██████████████████▉ | 419/2499 [51:03<3:38:46, 6.31s/it] {'loss': 0.5499, 'grad_norm': 0.15301309525966644, 'learning_rate': 0.00018703056235502103, 'ppl': 1.7331, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4536.51, 'total_tokens': 18375194, 'epoch': 0.5} + 17%|██████████████████▉ | 419/2499 [51:03<3:38:46, 6.31s/it] 17%|██████████████████▉ | 420/2499 [51:09<3:38:10, 6.30s/it] {'loss': 0.5583, 'grad_norm': 0.14875005185604095, 'learning_rate': 0.000186968328633013, 'ppl': 1.7477, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4548.16, 'total_tokens': 18403674, 'epoch': 0.5} + 17%|██████████████████▉ | 420/2499 [51:09<3:38:10, 6.30s/it] 17%|███████████████████ | 421/2499 [51:16<3:37:58, 6.29s/it] {'loss': 0.5516, 'grad_norm': 0.16316725313663483, 'learning_rate': 0.00018690595635957312, 'ppl': 1.736, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4436.18, 'total_tokens': 18431553, 'epoch': 0.51} + 17%|███████████████████ | 421/2499 [51:16<3:37:58, 6.29s/it] 17%|███████████████████ | 422/2499 [51:22<3:38:26, 6.31s/it] {'loss': 0.5639, 'grad_norm': 0.15915672481060028, 'learning_rate': 0.0001868434456340682, 'ppl': 1.7575, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4741.62, 'total_tokens': 18461634, 'epoch': 0.51} + 17%|███████████████████ | 422/2499 [51:22<3:38:26, 6.31s/it] 17%|███████████████████▏ | 423/2499 [51:28<3:37:56, 6.30s/it] {'loss': 0.623, 'grad_norm': 0.15488934516906738, 'learning_rate': 0.00018678079655608568, 'ppl': 1.8645, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4564.65, 'total_tokens': 18490250, 'epoch': 0.51} + 17%|███████████████████▏ | 423/2499 [51:28<3:37:56, 6.30s/it] 17%|███████████████████▏ | 424/2499 [51:34<3:37:33, 6.29s/it] {'loss': 0.5389, 'grad_norm': 0.16349388659000397, 'learning_rate': 0.00018671800922543338, 'ppl': 1.7141, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4610.8, 'total_tokens': 18519157, 'epoch': 0.51} + 17%|███████████████████▏ | 424/2499 [51:34<3:37:33, 6.29s/it] 17%|███████████████████▏ | 425/2499 [51:41<3:37:23, 6.29s/it] {'loss': 0.5917, 'grad_norm': 0.17400610446929932, 'learning_rate': 0.00018665508374213937, 'ppl': 1.8071, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4723.73, 'total_tokens': 18548836, 'epoch': 0.51} + 17%|███████████████████▏ | 425/2499 [51:41<3:37:23, 6.29s/it] 17%|███████████████████▎ | 426/2499 [51:47<3:37:05, 6.28s/it] {'loss': 0.5805, 'grad_norm': 0.15097637474536896, 'learning_rate': 0.00018659202020645182, 'ppl': 1.7869, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4556.88, 'total_tokens': 18577399, 'epoch': 0.51} + 17%|███████████████████▎ | 426/2499 [51:47<3:37:05, 6.28s/it] 17%|███████████████████▎ | 427/2499 [51:53<3:36:57, 6.28s/it] {'loss': 0.476, 'grad_norm': 0.13820724189281464, 'learning_rate': 0.0001865288187188388, 'ppl': 1.6096, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4660.43, 'total_tokens': 18606658, 'epoch': 0.51} + 17%|███████████████████▎ | 427/2499 [51:53<3:36:57, 6.28s/it] 17%|███████████████████▎ | 428/2499 [52:00<3:36:59, 6.29s/it] {'loss': 0.5665, 'grad_norm': 0.1509668529033661, 'learning_rate': 0.00018646547937998826, 'ppl': 1.7621, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4493.09, 'total_tokens': 18634930, 'epoch': 0.51} + 17%|███████████████████▎ | 428/2499 [52:00<3:36:59, 6.29s/it] 17%|███████████████████▍ | 429/2499 [52:06<3:37:16, 6.30s/it] {'loss': 0.6098, 'grad_norm': 0.16413377225399017, 'learning_rate': 0.00018640200229080763, 'ppl': 1.8401, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.09, 'tokens_per_second_per_gpu': 4687.87, 'total_tokens': 18664562, 'epoch': 0.52} + 17%|███████████████████▍ | 429/2499 [52:06<3:37:16, 6.30s/it] 17%|███████████████████▍ | 430/2499 [52:12<3:37:06, 6.30s/it] {'loss': 0.5941, 'grad_norm': 0.15067212283611298, 'learning_rate': 0.00018633838755242389, 'ppl': 1.8114, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4641.44, 'total_tokens': 18693744, 'epoch': 0.52} + 17%|███████████████████▍ | 430/2499 [52:12<3:37:06, 6.30s/it] 17%|███████████████████▍ | 431/2499 [52:18<3:36:44, 6.29s/it] {'loss': 0.6031, 'grad_norm': 0.16875723004341125, 'learning_rate': 0.00018627463526618327, 'ppl': 1.8278, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4484.05, 'total_tokens': 18721853, 'epoch': 0.52} + 17%|███████████████████▍ | 431/2499 [52:18<3:36:44, 6.29s/it] 17%|███████████████████▌ | 432/2499 [52:25<3:36:23, 6.28s/it] {'loss': 0.5569, 'grad_norm': 0.16489025950431824, 'learning_rate': 0.00018621074553365117, 'ppl': 1.7453, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4513.86, 'total_tokens': 18750119, 'epoch': 0.52} + 17%|███████████████████▌ | 432/2499 [52:25<3:36:23, 6.28s/it] 17%|███████████████████▌ | 433/2499 [52:31<3:36:14, 6.28s/it] {'loss': 0.6399, 'grad_norm': 0.16152667999267578, 'learning_rate': 0.0001861467184566119, 'ppl': 1.8963, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4645.4, 'total_tokens': 18779267, 'epoch': 0.52} + 17%|███████████████████▌ | 433/2499 [52:31<3:36:14, 6.28s/it] 17%|███████████████████▌ | 434/2499 [52:37<3:36:01, 6.28s/it] {'loss': 0.5099, 'grad_norm': 0.15479077398777008, 'learning_rate': 0.0001860825541370686, 'ppl': 1.6651, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4642.91, 'total_tokens': 18808360, 'epoch': 0.52} + 17%|███████████████████▌ | 434/2499 [52:37<3:36:01, 6.28s/it] 17%|███████████████████▋ | 435/2499 [52:44<3:36:14, 6.29s/it] {'loss': 0.5588, 'grad_norm': 0.14588800072669983, 'learning_rate': 0.00018601825267724307, 'ppl': 1.7486, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4598.14, 'total_tokens': 18837352, 'epoch': 0.52} + 17%|███████████████████▋ | 435/2499 [52:44<3:36:14, 6.29s/it] 17%|███████████████████▋ | 436/2499 [52:50<3:36:24, 6.29s/it] {'loss': 0.5802, 'grad_norm': 0.14621266722679138, 'learning_rate': 0.00018595381417957558, 'ppl': 1.7864, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4486.83, 'total_tokens': 18865660, 'epoch': 0.52} + 17%|███████████████████▋ | 436/2499 [52:50<3:36:24, 6.29s/it] 17%|███████████████████▊ | 437/2499 [52:56<3:36:07, 6.29s/it] {'loss': 0.5689, 'grad_norm': 0.1723642498254776, 'learning_rate': 0.00018588923874672474, 'ppl': 1.7663, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4501.25, 'total_tokens': 18893893, 'epoch': 0.52} + 17%|███████████████████▊ | 437/2499 [52:56<3:36:07, 6.29s/it] 18%|███████████████████▊ | 438/2499 [53:02<3:35:50, 6.28s/it] {'loss': 0.5598, 'grad_norm': 0.14782671630382538, 'learning_rate': 0.00018582452648156726, 'ppl': 1.7503, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4625.08, 'total_tokens': 18922889, 'epoch': 0.53} + 18%|███████████████████▊ | 438/2499 [53:02<3:35:50, 6.28s/it] 18%|███████████████████▊ | 439/2499 [53:09<3:35:38, 6.28s/it] {'loss': 0.6162, 'grad_norm': 0.16162589192390442, 'learning_rate': 0.0001857596774871979, 'ppl': 1.8519, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4605.11, 'total_tokens': 18951769, 'epoch': 0.53} + 18%|███████████████████▊ | 439/2499 [53:09<3:35:38, 6.28s/it] 18%|███████████████████▉ | 440/2499 [53:15<3:35:36, 6.28s/it] {'loss': 0.5212, 'grad_norm': 0.15044647455215454, 'learning_rate': 0.00018569469186692925, 'ppl': 1.684, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4708.11, 'total_tokens': 18981362, 'epoch': 0.53} + 18%|███████████████████▉ | 440/2499 [53:15<3:35:36, 6.28s/it] 18%|███████████████████▉ | 441/2499 [53:21<3:35:23, 6.28s/it] {'loss': 0.5599, 'grad_norm': 0.1452936828136444, 'learning_rate': 0.0001856295697242915, 'ppl': 1.7505, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4647.74, 'total_tokens': 19010504, 'epoch': 0.53} + 18%|███████████████████▉ | 441/2499 [53:21<3:35:23, 6.28s/it] 18%|███████████████████▉ | 442/2499 [53:28<3:35:26, 6.28s/it] {'loss': 0.5611, 'grad_norm': 0.1456039994955063, 'learning_rate': 0.0001855643111630324, 'ppl': 1.7526, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4472.27, 'total_tokens': 19038640, 'epoch': 0.53} + 18%|███████████████████▉ | 442/2499 [53:28<3:35:26, 6.28s/it] 18%|████████████████████ | 443/2499 [53:34<3:35:20, 6.28s/it] {'loss': 0.5572, 'grad_norm': 0.15803247690200806, 'learning_rate': 0.00018549891628711696, 'ppl': 1.7458, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4579.12, 'total_tokens': 19067400, 'epoch': 0.53} + 18%|████████████████████ | 443/2499 [53:34<3:35:20, 6.28s/it] 18%|████████████████████ | 444/2499 [53:40<3:35:01, 6.28s/it] {'loss': 0.5711, 'grad_norm': 0.15013474225997925, 'learning_rate': 0.00018543338520072745, 'ppl': 1.7702, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4475.98, 'total_tokens': 19095424, 'epoch': 0.53} + 18%|████████████████████ | 444/2499 [53:40<3:35:01, 6.28s/it] 18%|████████████████████ | 445/2499 [53:46<3:34:47, 6.27s/it] {'loss': 0.5748, 'grad_norm': 0.16678114235401154, 'learning_rate': 0.00018536771800826304, 'ppl': 1.7768, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4595.63, 'total_tokens': 19124207, 'epoch': 0.53} + 18%|████████████████████ | 445/2499 [53:46<3:34:47, 6.27s/it] 18%|████████████████████▏ | 446/2499 [53:53<3:34:34, 6.27s/it] {'loss': 0.5822, 'grad_norm': 0.15794029831886292, 'learning_rate': 0.00018530191481433986, 'ppl': 1.79, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4661.59, 'total_tokens': 19153399, 'epoch': 0.54} + 18%|████████████████████▏ | 446/2499 [53:53<3:34:34, 6.27s/it] 18%|████████████████████▏ | 447/2499 [53:59<3:34:22, 6.27s/it] {'loss': 0.5633, 'grad_norm': 0.15021128952503204, 'learning_rate': 0.0001852359757237906, 'ppl': 1.7565, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4615.87, 'total_tokens': 19182290, 'epoch': 0.54} + 18%|████████████████████▏ | 447/2499 [53:59<3:34:22, 6.27s/it] 18%|████████████████████▎ | 448/2499 [54:05<3:34:36, 6.28s/it] {'loss': 0.5825, 'grad_norm': 0.16197733581066132, 'learning_rate': 0.00018516990084166442, 'ppl': 1.7905, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4640.39, 'total_tokens': 19211516, 'epoch': 0.54} + 18%|████████████████████▎ | 448/2499 [54:05<3:34:36, 6.28s/it] 18%|████████████████████▎ | 449/2499 [54:11<3:34:56, 6.29s/it] {'loss': 0.6094, 'grad_norm': 0.1648341864347458, 'learning_rate': 0.000185103690273227, 'ppl': 1.8393, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4526.22, 'total_tokens': 19240112, 'epoch': 0.54} + 18%|████████████████████▎ | 449/2499 [54:11<3:34:56, 6.29s/it] 18%|████████████████████▎ | 450/2499 [54:18<3:34:49, 6.29s/it] {'loss': 0.5692, 'grad_norm': 0.15157613158226013, 'learning_rate': 0.00018503734412395994, 'ppl': 1.7669, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4723.23, 'total_tokens': 19269798, 'epoch': 0.54} + 18%|████████████████████▎ | 450/2499 [54:18<3:34:49, 6.29s/it] 18%|████████████████████▍ | 451/2499 [54:24<3:34:38, 6.29s/it] {'loss': 0.5899, 'grad_norm': 0.15227428078651428, 'learning_rate': 0.00018497086249956107, 'ppl': 1.8038, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4752.62, 'total_tokens': 19299647, 'epoch': 0.54} + 18%|████████████████████▍ | 451/2499 [54:24<3:34:38, 6.29s/it] 18%|████████████████████▍ | 452/2499 [54:30<3:34:18, 6.28s/it] {'loss': 0.5725, 'grad_norm': 0.18285728991031647, 'learning_rate': 0.00018490424550594384, 'ppl': 1.7727, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4465.02, 'total_tokens': 19327610, 'epoch': 0.54} + 18%|████████████████████▍ | 452/2499 [54:30<3:34:18, 6.28s/it] 18%|████████████████████▍ | 453/2499 [54:37<3:34:07, 6.28s/it] {'loss': 0.5944, 'grad_norm': 0.1537967026233673, 'learning_rate': 0.00018483749324923752, 'ppl': 1.8119, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4686.13, 'total_tokens': 19357004, 'epoch': 0.54} + 18%|████████████████████▍ | 453/2499 [54:37<3:34:07, 6.28s/it] 18%|████████████████████▌ | 454/2499 [54:43<3:33:52, 6.28s/it] {'loss': 0.605, 'grad_norm': 0.16797775030136108, 'learning_rate': 0.00018477060583578676, 'ppl': 1.8313, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4550.85, 'total_tokens': 19385505, 'epoch': 0.55} + 18%|████████████████████▌ | 454/2499 [54:43<3:33:52, 6.28s/it] 18%|████████████████████▌ | 455/2499 [54:49<3:33:56, 6.28s/it] {'loss': 0.621, 'grad_norm': 0.1565423309803009, 'learning_rate': 0.00018470358337215162, 'ppl': 1.8608, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4593.26, 'total_tokens': 19414395, 'epoch': 0.55} + 18%|████████████████████▌ | 455/2499 [54:49<3:33:56, 6.28s/it] 18%|████████████████████▌ | 456/2499 [54:55<3:34:10, 6.29s/it] {'loss': 0.5541, 'grad_norm': 0.15655626356601715, 'learning_rate': 0.0001846364259651073, 'ppl': 1.7404, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4694.8, 'total_tokens': 19444018, 'epoch': 0.55} + 18%|████████████████████▌ | 456/2499 [54:55<3:34:10, 6.29s/it] 18%|████████████████████▋ | 457/2499 [55:02<3:33:47, 6.28s/it] {'loss': 0.5553, 'grad_norm': 0.15646992623806, 'learning_rate': 0.00018456913372164388, 'ppl': 1.7425, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4453.86, 'total_tokens': 19471900, 'epoch': 0.55} + 18%|████████████████████▋ | 457/2499 [55:02<3:33:47, 6.28s/it] 18%|████████████████████▋ | 458/2499 [55:08<3:33:26, 6.27s/it] {'loss': 0.5948, 'grad_norm': 0.1484660506248474, 'learning_rate': 0.0001845017067489664, 'ppl': 1.8127, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4470.62, 'total_tokens': 19499866, 'epoch': 0.55} + 18%|████████████████████▋ | 458/2499 [55:08<3:33:26, 6.27s/it] 18%|████████████████████▊ | 459/2499 [55:14<3:33:14, 6.27s/it] {'loss': 0.5839, 'grad_norm': 0.1672670543193817, 'learning_rate': 0.00018443414515449438, 'ppl': 1.793, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4569.68, 'total_tokens': 19528485, 'epoch': 0.55} + 18%|████████████████████▊ | 459/2499 [55:14<3:33:14, 6.27s/it] 18%|████████████████████▊ | 460/2499 [55:21<3:33:01, 6.27s/it] {'loss': 0.6341, 'grad_norm': 0.1631045639514923, 'learning_rate': 0.00018436644904586198, 'ppl': 1.8853, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4502.65, 'total_tokens': 19556666, 'epoch': 0.55} + 18%|████████████████████▊ | 460/2499 [55:21<3:33:01, 6.27s/it] 18%|████████████████████▊ | 461/2499 [55:27<3:32:52, 6.27s/it] {'loss': 0.5554, 'grad_norm': 0.15775103867053986, 'learning_rate': 0.00018429861853091754, 'ppl': 1.7426, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4483.55, 'total_tokens': 19584739, 'epoch': 0.55} + 18%|████████████████████▊ | 461/2499 [55:27<3:32:52, 6.27s/it] 18%|████████████████████▉ | 462/2499 [55:33<3:33:02, 6.27s/it] {'loss': 0.5995, 'grad_norm': 0.16724328696727753, 'learning_rate': 0.00018423065371772355, 'ppl': 1.8212, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4399.67, 'total_tokens': 19612415, 'epoch': 0.55} + 18%|████████████████████▉ | 462/2499 [55:33<3:33:02, 6.27s/it] 19%|████████████████████▉ | 463/2499 [55:39<3:33:01, 6.28s/it] {'loss': 0.5533, 'grad_norm': 0.14767299592494965, 'learning_rate': 0.00018416255471455646, 'ppl': 1.739, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4603.33, 'total_tokens': 19641326, 'epoch': 0.56} + 19%|████████████████████▉ | 463/2499 [55:39<3:33:01, 6.28s/it] 19%|████████████████████▉ | 464/2499 [55:46<3:32:50, 6.28s/it] {'loss': 0.5469, 'grad_norm': 0.16348305344581604, 'learning_rate': 0.0001840943216299065, 'ppl': 1.7279, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4562.37, 'total_tokens': 19669924, 'epoch': 0.56} + 19%|████████████████████▉ | 464/2499 [55:46<3:32:50, 6.28s/it] 19%|█████████████████████ | 465/2499 [55:52<3:32:48, 6.28s/it] {'loss': 0.5275, 'grad_norm': 0.15997561812400818, 'learning_rate': 0.00018402595457247758, 'ppl': 1.6947, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4670.61, 'total_tokens': 19699254, 'epoch': 0.56} + 19%|█████████████████████ | 465/2499 [55:52<3:32:48, 6.28s/it] 19%|█████████████████████ | 466/2499 [55:58<3:32:25, 6.27s/it] {'loss': 0.5625, 'grad_norm': 0.16886287927627563, 'learning_rate': 0.00018395745365118687, 'ppl': 1.7551, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4419.59, 'total_tokens': 19726868, 'epoch': 0.56} + 19%|█████████████████████ | 466/2499 [55:58<3:32:25, 6.27s/it] 19%|█████████████████████ | 467/2499 [56:04<3:32:19, 6.27s/it] {'loss': 0.5631, 'grad_norm': 0.14943641424179077, 'learning_rate': 0.000183888818975165, 'ppl': 1.7561, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4526.6, 'total_tokens': 19755240, 'epoch': 0.56} + 19%|█████████████████████ | 467/2499 [56:04<3:32:19, 6.27s/it] 19%|█████████████████████▏ | 468/2499 [56:11<3:32:39, 6.28s/it] {'loss': 0.5527, 'grad_norm': 0.1644650548696518, 'learning_rate': 0.0001838200506537556, 'ppl': 1.7379, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4594.43, 'total_tokens': 19784231, 'epoch': 0.56} + 19%|█████████████████████▏ | 468/2499 [56:11<3:32:39, 6.28s/it] 19%|█████████████████████▏ | 469/2499 [56:17<3:32:52, 6.29s/it] {'loss': 0.5602, 'grad_norm': 0.21660394966602325, 'learning_rate': 0.0001837511487965151, 'ppl': 1.751, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4405.97, 'total_tokens': 19812033, 'epoch': 0.56} + 19%|█████████████████████▏ | 469/2499 [56:17<3:32:52, 6.29s/it] 19%|█████████████████████▎ | 470/2499 [56:23<3:32:19, 6.28s/it] {'loss': 0.5774, 'grad_norm': 0.16006816923618317, 'learning_rate': 0.00018368211351321294, 'ppl': 1.7814, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4336.95, 'total_tokens': 19839111, 'epoch': 0.56} + 19%|█████████████████████▎ | 470/2499 [56:23<3:32:19, 6.28s/it] 19%|█████████████████████▎ | 471/2499 [56:30<3:32:05, 6.27s/it] {'loss': 0.6026, 'grad_norm': 0.15922212600708008, 'learning_rate': 0.00018361294491383085, 'ppl': 1.8269, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4576.39, 'total_tokens': 19867776, 'epoch': 0.57} + 19%|█████████████████████▎ | 471/2499 [56:30<3:32:05, 6.27s/it] 19%|█████████████████████▎ | 472/2499 [56:36<3:32:01, 6.28s/it] {'loss': 0.5705, 'grad_norm': 0.15619011223316193, 'learning_rate': 0.0001835436431085631, 'ppl': 1.7692, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4555.67, 'total_tokens': 19896367, 'epoch': 0.57} + 19%|█████████████████████▎ | 472/2499 [56:36<3:32:01, 6.28s/it] 19%|█████████████████████▍ | 473/2499 [56:42<3:31:52, 6.27s/it] {'loss': 0.5242, 'grad_norm': 0.16088345646858215, 'learning_rate': 0.00018347420820781618, 'ppl': 1.6891, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4618.23, 'total_tokens': 19925320, 'epoch': 0.57} + 19%|█████████████████████▍ | 473/2499 [56:42<3:31:52, 6.27s/it] 19%|█████████████████████▍ | 474/2499 [56:48<3:31:43, 6.27s/it] {'loss': 0.5705, 'grad_norm': 0.14945322275161743, 'learning_rate': 0.0001834046403222085, 'ppl': 1.7692, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4559.94, 'total_tokens': 19953900, 'epoch': 0.57} + 19%|█████████████████████▍ | 474/2499 [56:48<3:31:43, 6.27s/it] 19%|█████████████████████▍ | 475/2499 [56:55<3:31:55, 6.28s/it] {'loss': 0.5825, 'grad_norm': 0.16998130083084106, 'learning_rate': 0.0001833349395625705, 'ppl': 1.7905, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4578.29, 'total_tokens': 19982746, 'epoch': 0.57} + 19%|█████████████████████▍ | 475/2499 [56:55<3:31:55, 6.28s/it] 19%|█████████████████████▌ | 476/2499 [57:01<3:32:08, 6.29s/it] {'loss': 0.5645, 'grad_norm': 0.16215763986110687, 'learning_rate': 0.00018326510603994408, 'ppl': 1.7586, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4533.6, 'total_tokens': 20011356, 'epoch': 0.57} + 19%|█████████████████████▌ | 476/2499 [57:01<3:32:08, 6.29s/it] 19%|█████████████████████▌ | 477/2499 [57:07<3:31:57, 6.29s/it] {'loss': 0.517, 'grad_norm': 0.1561872959136963, 'learning_rate': 0.0001831951398655829, 'ppl': 1.677, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4588.7, 'total_tokens': 20040170, 'epoch': 0.57} + 19%|█████████████████████▌ | 477/2499 [57:07<3:31:57, 6.29s/it] 19%|█████████████████████▌ | 478/2499 [57:14<3:31:45, 6.29s/it] {'loss': 0.5762, 'grad_norm': 0.14602787792682648, 'learning_rate': 0.00018312504115095183, 'ppl': 1.7793, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4743.0, 'total_tokens': 20069945, 'epoch': 0.57} + 19%|█████████████████████▌ | 478/2499 [57:14<3:31:45, 6.29s/it] 19%|█████████████████████▋ | 479/2499 [57:20<3:31:22, 6.28s/it] {'loss': 0.5952, 'grad_norm': 0.15776875615119934, 'learning_rate': 0.0001830548100077268, 'ppl': 1.8134, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4432.95, 'total_tokens': 20097685, 'epoch': 0.58} + 19%|█████████████████████▋ | 479/2499 [57:20<3:31:22, 6.28s/it] 19%|█████████████████████▋ | 480/2499 [57:26<3:31:12, 6.28s/it] {'loss': 0.5511, 'grad_norm': 0.17158068716526031, 'learning_rate': 0.00018298444654779494, 'ppl': 1.7352, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4570.76, 'total_tokens': 20126338, 'epoch': 0.58} + 19%|█████████████████████▋ | 480/2499 [57:26<3:31:12, 6.28s/it] 19%|█████████████████████▋ | 481/2499 [57:32<3:30:57, 6.27s/it] {'loss': 0.5418, 'grad_norm': 0.16042540967464447, 'learning_rate': 0.00018291395088325393, 'ppl': 1.7191, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4549.64, 'total_tokens': 20154820, 'epoch': 0.58} + 19%|█████████████████████▋ | 481/2499 [57:32<3:30:57, 6.27s/it] 19%|█████████████████████▊ | 482/2499 [57:39<3:31:05, 6.28s/it] {'loss': 0.5878, 'grad_norm': 0.15308934450149536, 'learning_rate': 0.00018284332312641226, 'ppl': 1.8, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4432.0, 'total_tokens': 20182711, 'epoch': 0.58} + 19%|█████████████████████▊ | 482/2499 [57:39<3:31:05, 6.28s/it] 19%|█████████████████████▊ | 483/2499 [57:45<3:31:21, 6.29s/it] {'loss': 0.571, 'grad_norm': 0.1654343158006668, 'learning_rate': 0.00018277256338978875, 'ppl': 1.77, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4780.75, 'total_tokens': 20212891, 'epoch': 0.58} + 19%|█████████████████████▊ | 483/2499 [57:45<3:31:21, 6.29s/it] 19%|█████████████████████▉ | 484/2499 [57:51<3:31:13, 6.29s/it] {'loss': 0.62, 'grad_norm': 0.15104906260967255, 'learning_rate': 0.00018270167178611254, 'ppl': 1.8589, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4787.43, 'total_tokens': 20242973, 'epoch': 0.58} + 19%|█████████████████████▉ | 484/2499 [57:51<3:31:13, 6.29s/it] 19%|█████████████████████▉ | 485/2499 [57:58<3:30:54, 6.28s/it] {'loss': 0.5792, 'grad_norm': 0.15305499732494354, 'learning_rate': 0.00018263064842832281, 'ppl': 1.7846, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4599.65, 'total_tokens': 20271797, 'epoch': 0.58} + 19%|█████████████████████▉ | 485/2499 [57:58<3:30:54, 6.28s/it] 19%|█████████████████████▉ | 486/2499 [58:04<3:30:32, 6.28s/it] {'loss': 0.6147, 'grad_norm': 0.16085773706436157, 'learning_rate': 0.00018255949342956863, 'ppl': 1.8491, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4499.19, 'total_tokens': 20299936, 'epoch': 0.58} + 19%|█████████████████████▉ | 486/2499 [58:04<3:30:32, 6.28s/it] 19%|██████████████████████ | 487/2499 [58:10<3:30:11, 6.27s/it] {'loss': 0.6101, 'grad_norm': 0.18756824731826782, 'learning_rate': 0.00018248820690320889, 'ppl': 1.8406, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4454.08, 'total_tokens': 20327766, 'epoch': 0.58} + 19%|██████████████████████ | 487/2499 [58:10<3:30:11, 6.27s/it] 20%|██████████████████████ | 488/2499 [58:16<3:30:03, 6.27s/it] {'loss': 0.6178, 'grad_norm': 0.17079570889472961, 'learning_rate': 0.00018241678896281188, 'ppl': 1.8548, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4494.52, 'total_tokens': 20355914, 'epoch': 0.59} + 20%|██████████████████████ | 488/2499 [58:16<3:30:03, 6.27s/it] 20%|██████████████████████ | 489/2499 [58:23<3:30:06, 6.27s/it] {'loss': 0.6194, 'grad_norm': 0.16470293700695038, 'learning_rate': 0.00018234523972215536, 'ppl': 1.8578, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4335.83, 'total_tokens': 20383145, 'epoch': 0.59} + 20%|██████████████████████ | 489/2499 [58:23<3:30:06, 6.27s/it] 20%|██████████████████████▏ | 490/2499 [58:29<3:30:02, 6.27s/it] {'loss': 0.5485, 'grad_norm': 0.16623155772686005, 'learning_rate': 0.00018227355929522623, 'ppl': 1.7307, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4540.41, 'total_tokens': 20411623, 'epoch': 0.59} + 20%|██████████████████████▏ | 490/2499 [58:29<3:30:02, 6.27s/it] 20%|██████████████████████▏ | 491/2499 [58:35<3:30:05, 6.28s/it] {'loss': 0.5823, 'grad_norm': 0.18991202116012573, 'learning_rate': 0.00018220174779622034, 'ppl': 1.7902, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4712.78, 'total_tokens': 20441243, 'epoch': 0.59} + 20%|██████████████████████▏ | 491/2499 [58:35<3:30:05, 6.28s/it] 20%|██████████████████████▏ | 492/2499 [58:41<3:29:51, 6.27s/it] {'loss': 0.567, 'grad_norm': 0.16380782425403595, 'learning_rate': 0.00018212980533954243, 'ppl': 1.763, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4455.55, 'total_tokens': 20469147, 'epoch': 0.59} + 20%|██████████████████████▏ | 492/2499 [58:41<3:29:51, 6.27s/it] 20%|██████████████████████▎ | 493/2499 [58:48<3:29:39, 6.27s/it] {'loss': 0.5529, 'grad_norm': 0.15777407586574554, 'learning_rate': 0.00018205773203980582, 'ppl': 1.7383, 'memory/max_active (GiB)': 18.08, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4500.07, 'total_tokens': 20497328, 'epoch': 0.59} + 20%|██████████████████████▎ | 493/2499 [58:48<3:29:39, 6.27s/it] 20%|██████████████████████▎ | 494/2499 [58:54<3:29:43, 6.28s/it] {'loss': 0.5155, 'grad_norm': 0.15534964203834534, 'learning_rate': 0.0001819855280118323, 'ppl': 1.6745, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4716.79, 'total_tokens': 20526972, 'epoch': 0.59} + 20%|██████████████████████▎ | 494/2499 [58:54<3:29:43, 6.28s/it] 20%|██████████████████████▍ | 495/2499 [59:00<3:29:32, 6.27s/it] {'loss': 0.575, 'grad_norm': 0.1571529060602188, 'learning_rate': 0.00018191319337065195, 'ppl': 1.7771, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4597.26, 'total_tokens': 20555776, 'epoch': 0.59} + 20%|██████████████████████▍ | 495/2499 [59:00<3:29:32, 6.27s/it] 20%|██████████████████████▍ | 496/2499 [59:07<3:29:34, 6.28s/it] {'loss': 0.5403, 'grad_norm': 0.15741947293281555, 'learning_rate': 0.00018184072823150283, 'ppl': 1.7165, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4368.14, 'total_tokens': 20583232, 'epoch': 0.6} + 20%|██████████████████████▍ | 496/2499 [59:07<3:29:34, 6.28s/it] 20%|██████████████████████▍ | 497/2499 [59:13<3:29:42, 6.28s/it] {'loss': 0.598, 'grad_norm': 0.15272513031959534, 'learning_rate': 0.00018176813270983107, 'ppl': 1.8185, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4795.9, 'total_tokens': 20613435, 'epoch': 0.6} + 20%|██████████████████████▍ | 497/2499 [59:13<3:29:42, 6.28s/it] 20%|██████████████████████▌ | 498/2499 [59:19<3:29:15, 6.27s/it] {'loss': 0.5788, 'grad_norm': 0.17274294793605804, 'learning_rate': 0.00018169540692129034, 'ppl': 1.7839, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4501.35, 'total_tokens': 20641562, 'epoch': 0.6} + 20%|██████████████████████▌ | 498/2499 [59:19<3:29:15, 6.27s/it] 20%|██████████████████████▌ | 499/2499 [59:25<3:29:06, 6.27s/it] {'loss': 0.5548, 'grad_norm': 0.1560908406972885, 'learning_rate': 0.000181622550981742, 'ppl': 1.7416, 'memory/max_active (GiB)': 18.11, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4626.44, 'total_tokens': 20670558, 'epoch': 0.6} + 20%|██████████████████████▌ | 499/2499 [59:25<3:29:06, 6.27s/it] 20%|██████████████████████▌ | 500/2499 [59:32<3:28:52, 6.27s/it] {'loss': 0.5454, 'grad_norm': 0.15503084659576416, 'learning_rate': 0.0001815495650072546, 'ppl': 1.7253, 'memory/max_active (GiB)': 18.05, 'memory/max_allocated (GiB)': 12.52, 'memory/device_reserved (GiB)': 20.15, 'tokens_per_second_per_gpu': 4517.68, 'total_tokens': 20698829, 'epoch': 0.6} + 20%|██████████████████████▌ | 500/2499 [59:32<3:28:52, 6.27s/it][2025-12-28 12:05:08,249] [INFO] [axolotl.core.trainers.base.evaluate:388] [PID:42410] Running evaluation step... +[2025-12-28 12:05:09,958] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8499741554260254 +[2025-12-28 12:05:10,798] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8398079872131348 +[2025-12-28 12:05:11,648] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.847926139831543 +[2025-12-28 12:05:12,486] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:42410] generate_batches time: 0.8376157283782959 +[2025-12-28 12:05:12,486] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:42410] gather_len_batches: [90] + + 0%| | 0/90 [00:00