diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3468 +1,6618 @@ { - "best_metric": 4.272188186645508, - "best_model_checkpoint": "learning_source_20260316/rna_celltype/bert-output/rna_celltype-medium/checkpoint-23000", - "epoch": 162.3290692545214, + "best_metric": 3.1239471435546875, + "best_model_checkpoint": "learning_source_20260316/rna_celltype/bert-output/rna_celltype-medium/checkpoint-44000", + "epoch": 621.8821453775582, "eval_steps": 100, - "global_step": 23000, + "global_step": 44000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.7057785619761799, - "grad_norm": 9.433653831481934, + "grad_norm": 4.538567066192627, "learning_rate": 5e-06, - "loss": 7.4256, + "loss": 5.8142, "step": 100 }, { "epoch": 0.7057785619761799, - "eval_loss": 7.134615421295166, - "eval_runtime": 192.8592, - "eval_samples_per_second": 51.851, - "eval_steps_per_second": 6.481, + "eval_loss": 5.327540397644043, + "eval_runtime": 193.0087, + "eval_samples_per_second": 51.811, + "eval_steps_per_second": 6.476, "step": 100 }, { "epoch": 1.4115571239523599, - "grad_norm": 11.789298057556152, + "grad_norm": 7.248689651489258, "learning_rate": 1e-05, - "loss": 7.1907, + "loss": 5.4369, "step": 200 }, { "epoch": 1.4115571239523599, - "eval_loss": 6.91338586807251, - "eval_runtime": 193.3504, - "eval_samples_per_second": 51.72, - "eval_steps_per_second": 6.465, + "eval_loss": 5.1204304695129395, + "eval_runtime": 192.4871, + "eval_samples_per_second": 51.952, + "eval_steps_per_second": 6.494, "step": 200 }, { "epoch": 2.11733568592854, - "grad_norm": 5.97905969619751, + "grad_norm": 3.5560193061828613, "learning_rate": 9.983277591973245e-06, - "loss": 7.037, + "loss": 5.2934, "step": 300 }, { "epoch": 2.11733568592854, - "eval_loss": 6.766317844390869, - "eval_runtime": 192.8739, - "eval_samples_per_second": 51.847, - "eval_steps_per_second": 6.481, + "eval_loss": 5.017918109893799, + "eval_runtime": 191.809, + "eval_samples_per_second": 52.135, + "eval_steps_per_second": 6.517, "step": 300 }, { "epoch": 2.8231142479047198, - "grad_norm": 10.660772323608398, + "grad_norm": 3.971824884414673, "learning_rate": 9.966555183946488e-06, - "loss": 6.9152, + "loss": 5.1746, "step": 400 }, { "epoch": 2.8231142479047198, - "eval_loss": 6.6594977378845215, - "eval_runtime": 192.7031, - "eval_samples_per_second": 51.893, - "eval_steps_per_second": 6.487, + "eval_loss": 4.890848636627197, + "eval_runtime": 192.9174, + "eval_samples_per_second": 51.836, + "eval_steps_per_second": 6.479, "step": 400 }, { "epoch": 3.5288928098809, - "grad_norm": 13.777559280395508, + "grad_norm": 4.859294891357422, "learning_rate": 9.949832775919734e-06, - "loss": 6.7571, + "loss": 5.0841, "step": 500 }, { "epoch": 3.5288928098809, - "eval_loss": 6.499149322509766, - "eval_runtime": 192.77, - "eval_samples_per_second": 51.875, - "eval_steps_per_second": 6.484, + "eval_loss": 4.817997455596924, + "eval_runtime": 192.8217, + "eval_samples_per_second": 51.861, + "eval_steps_per_second": 6.483, "step": 500 }, { "epoch": 4.23467137185708, - "grad_norm": 9.750181198120117, + "grad_norm": 6.207881927490234, "learning_rate": 9.933110367892978e-06, - "loss": 6.6514, + "loss": 5.0152, "step": 600 }, { "epoch": 4.23467137185708, - "eval_loss": 6.413051605224609, - "eval_runtime": 192.6584, - "eval_samples_per_second": 51.905, - "eval_steps_per_second": 6.488, + "eval_loss": 4.762999057769775, + "eval_runtime": 192.7086, + "eval_samples_per_second": 51.892, + "eval_steps_per_second": 6.486, "step": 600 }, { "epoch": 4.94044993383326, - "grad_norm": 8.93947982788086, + "grad_norm": 6.541872024536133, "learning_rate": 9.916387959866221e-06, - "loss": 6.5614, + "loss": 4.962, "step": 700 }, { "epoch": 4.94044993383326, - "eval_loss": 6.351477146148682, - "eval_runtime": 193.0308, - "eval_samples_per_second": 51.805, - "eval_steps_per_second": 6.476, + "eval_loss": 4.727071762084961, + "eval_runtime": 192.7219, + "eval_samples_per_second": 51.888, + "eval_steps_per_second": 6.486, "step": 700 }, { "epoch": 5.6462284958094395, - "grad_norm": 8.086716651916504, + "grad_norm": 5.75955057144165, "learning_rate": 9.899665551839465e-06, - "loss": 6.4893, + "loss": 4.9235, "step": 800 }, { "epoch": 5.6462284958094395, - "eval_loss": 6.218596935272217, - "eval_runtime": 192.7294, - "eval_samples_per_second": 51.886, - "eval_steps_per_second": 6.486, + "eval_loss": 4.683049201965332, + "eval_runtime": 192.7611, + "eval_samples_per_second": 51.878, + "eval_steps_per_second": 6.485, "step": 800 }, { "epoch": 6.352007057785619, - "grad_norm": 9.664277076721191, + "grad_norm": 5.269717693328857, "learning_rate": 9.88294314381271e-06, - "loss": 6.4178, + "loss": 4.8862, "step": 900 }, { "epoch": 6.352007057785619, - "eval_loss": 6.159446716308594, - "eval_runtime": 193.0818, - "eval_samples_per_second": 51.792, - "eval_steps_per_second": 6.474, + "eval_loss": 4.64549446105957, + "eval_runtime": 196.0126, + "eval_samples_per_second": 51.017, + "eval_steps_per_second": 6.377, "step": 900 }, { "epoch": 7.0577856197618, - "grad_norm": 8.006940841674805, + "grad_norm": 5.438960552215576, "learning_rate": 9.866220735785954e-06, - "loss": 6.3546, + "loss": 4.8578, "step": 1000 }, { "epoch": 7.0577856197618, - "eval_loss": 6.15038537979126, - "eval_runtime": 192.9009, - "eval_samples_per_second": 51.84, - "eval_steps_per_second": 6.48, + "eval_loss": 4.6224799156188965, + "eval_runtime": 192.8239, + "eval_samples_per_second": 51.861, + "eval_steps_per_second": 6.483, "step": 1000 }, { "epoch": 7.76356418173798, - "grad_norm": 8.577669143676758, + "grad_norm": 6.405101776123047, "learning_rate": 9.849498327759198e-06, - "loss": 6.2995, + "loss": 4.8345, "step": 1100 }, { "epoch": 7.76356418173798, - "eval_loss": 6.0646772384643555, - "eval_runtime": 193.018, - "eval_samples_per_second": 51.809, - "eval_steps_per_second": 6.476, + "eval_loss": 4.619054317474365, + "eval_runtime": 192.9295, + "eval_samples_per_second": 51.832, + "eval_steps_per_second": 6.479, "step": 1100 }, { "epoch": 8.46934274371416, - "grad_norm": 8.229045867919922, + "grad_norm": 5.179597854614258, "learning_rate": 9.832775919732442e-06, - "loss": 6.2452, + "loss": 4.8176, "step": 1200 }, { "epoch": 8.46934274371416, - "eval_loss": 6.027785301208496, - "eval_runtime": 192.7708, - "eval_samples_per_second": 51.875, - "eval_steps_per_second": 6.484, + "eval_loss": 4.602497100830078, + "eval_runtime": 193.0329, + "eval_samples_per_second": 51.805, + "eval_steps_per_second": 6.476, "step": 1200 }, { "epoch": 9.17512130569034, - "grad_norm": 5.580593585968018, + "grad_norm": 3.9729690551757812, "learning_rate": 9.816053511705687e-06, - "loss": 6.2021, + "loss": 4.7947, "step": 1300 }, { "epoch": 9.17512130569034, - "eval_loss": 6.017040729522705, - "eval_runtime": 192.743, - "eval_samples_per_second": 51.883, - "eval_steps_per_second": 6.485, + "eval_loss": 4.545534610748291, + "eval_runtime": 192.786, + "eval_samples_per_second": 51.871, + "eval_steps_per_second": 6.484, "step": 1300 }, { "epoch": 9.88089986766652, - "grad_norm": 5.742369651794434, + "grad_norm": 4.882862567901611, "learning_rate": 9.799331103678931e-06, - "loss": 6.1506, + "loss": 4.7722, "step": 1400 }, { "epoch": 9.88089986766652, - "eval_loss": 5.927543640136719, - "eval_runtime": 192.6863, - "eval_samples_per_second": 51.898, - "eval_steps_per_second": 6.487, + "eval_loss": 4.546108722686768, + "eval_runtime": 192.8307, + "eval_samples_per_second": 51.859, + "eval_steps_per_second": 6.482, "step": 1400 }, { "epoch": 10.5866784296427, - "grad_norm": 10.356036186218262, + "grad_norm": 3.8944156169891357, "learning_rate": 9.782608695652175e-06, - "loss": 6.1085, + "loss": 4.7513, "step": 1500 }, { "epoch": 10.5866784296427, - "eval_loss": 5.921632766723633, - "eval_runtime": 193.3134, - "eval_samples_per_second": 51.729, - "eval_steps_per_second": 6.466, + "eval_loss": 4.515088081359863, + "eval_runtime": 192.7368, + "eval_samples_per_second": 51.884, + "eval_steps_per_second": 6.486, "step": 1500 }, { "epoch": 11.292456991618879, - "grad_norm": 7.751805782318115, + "grad_norm": 5.533792018890381, "learning_rate": 9.765886287625419e-06, - "loss": 6.0714, + "loss": 4.731, "step": 1600 }, { "epoch": 11.292456991618879, - "eval_loss": 5.864123821258545, - "eval_runtime": 193.5796, - "eval_samples_per_second": 51.658, - "eval_steps_per_second": 6.457, + "eval_loss": 4.497799396514893, + "eval_runtime": 191.5673, + "eval_samples_per_second": 52.201, + "eval_steps_per_second": 6.525, "step": 1600 }, { "epoch": 11.998235553595059, - "grad_norm": 8.487528800964355, + "grad_norm": 4.2257256507873535, "learning_rate": 9.749163879598664e-06, - "loss": 6.0379, + "loss": 4.7193, "step": 1700 }, { "epoch": 11.998235553595059, - "eval_loss": 5.818101406097412, - "eval_runtime": 192.5622, - "eval_samples_per_second": 51.931, - "eval_steps_per_second": 6.491, + "eval_loss": 4.487800598144531, + "eval_runtime": 192.5021, + "eval_samples_per_second": 51.947, + "eval_steps_per_second": 6.493, "step": 1700 }, { "epoch": 12.704014115571239, - "grad_norm": 5.235436916351318, + "grad_norm": 4.029520034790039, "learning_rate": 9.732441471571908e-06, - "loss": 6.0087, + "loss": 4.7014, "step": 1800 }, { "epoch": 12.704014115571239, - "eval_loss": 5.806471824645996, - "eval_runtime": 192.7608, - "eval_samples_per_second": 51.878, - "eval_steps_per_second": 6.485, + "eval_loss": 4.481775760650635, + "eval_runtime": 192.7297, + "eval_samples_per_second": 51.886, + "eval_steps_per_second": 6.486, "step": 1800 }, { "epoch": 13.40979267754742, - "grad_norm": 5.4680562019348145, + "grad_norm": 3.9024765491485596, "learning_rate": 9.715719063545151e-06, - "loss": 5.9737, + "loss": 4.6848, "step": 1900 }, { "epoch": 13.40979267754742, - "eval_loss": 5.759986877441406, - "eval_runtime": 192.6113, - "eval_samples_per_second": 51.918, - "eval_steps_per_second": 6.49, + "eval_loss": 4.4503583908081055, + "eval_runtime": 192.7847, + "eval_samples_per_second": 51.871, + "eval_steps_per_second": 6.484, "step": 1900 }, { "epoch": 14.1155712395236, - "grad_norm": 8.043121337890625, + "grad_norm": 5.402480125427246, "learning_rate": 9.698996655518395e-06, - "loss": 5.9431, + "loss": 4.6727, "step": 2000 }, { "epoch": 14.1155712395236, - "eval_loss": 5.742130279541016, - "eval_runtime": 192.7372, - "eval_samples_per_second": 51.884, - "eval_steps_per_second": 6.486, + "eval_loss": 4.455318927764893, + "eval_runtime": 192.7638, + "eval_samples_per_second": 51.877, + "eval_steps_per_second": 6.485, "step": 2000 }, { "epoch": 14.82134980149978, - "grad_norm": 5.295115947723389, + "grad_norm": 3.5884945392608643, "learning_rate": 9.682274247491639e-06, - "loss": 5.911, + "loss": 4.6611, "step": 2100 }, { "epoch": 14.82134980149978, - "eval_loss": 5.718338489532471, - "eval_runtime": 193.0201, - "eval_samples_per_second": 51.808, - "eval_steps_per_second": 6.476, + "eval_loss": 4.425973415374756, + "eval_runtime": 192.8769, + "eval_samples_per_second": 51.847, + "eval_steps_per_second": 6.481, "step": 2100 }, { "epoch": 15.52712836347596, - "grad_norm": 7.699941158294678, + "grad_norm": 4.391833782196045, "learning_rate": 9.665551839464884e-06, - "loss": 5.8842, + "loss": 4.6476, "step": 2200 }, { "epoch": 15.52712836347596, - "eval_loss": 5.6654181480407715, - "eval_runtime": 193.0942, - "eval_samples_per_second": 51.788, - "eval_steps_per_second": 6.474, + "eval_loss": 4.420197010040283, + "eval_runtime": 193.3815, + "eval_samples_per_second": 51.711, + "eval_steps_per_second": 6.464, "step": 2200 }, { "epoch": 16.232906925452138, - "grad_norm": 6.278568744659424, + "grad_norm": 3.3455958366394043, "learning_rate": 9.648829431438128e-06, - "loss": 5.8506, + "loss": 4.638, "step": 2300 }, { "epoch": 16.232906925452138, - "eval_loss": 5.633941650390625, - "eval_runtime": 196.486, - "eval_samples_per_second": 50.894, - "eval_steps_per_second": 6.362, + "eval_loss": 4.404079914093018, + "eval_runtime": 192.7422, + "eval_samples_per_second": 51.883, + "eval_steps_per_second": 6.485, "step": 2300 }, { "epoch": 16.93868548742832, - "grad_norm": 5.364164352416992, + "grad_norm": 2.938906669616699, "learning_rate": 9.632107023411372e-06, - "loss": 5.8247, + "loss": 4.6264, "step": 2400 }, { "epoch": 16.93868548742832, - "eval_loss": 5.628259181976318, - "eval_runtime": 192.9959, - "eval_samples_per_second": 51.815, - "eval_steps_per_second": 6.477, + "eval_loss": 4.401567459106445, + "eval_runtime": 192.7203, + "eval_samples_per_second": 51.889, + "eval_steps_per_second": 6.486, "step": 2400 }, { "epoch": 17.644464049404498, - "grad_norm": 5.200554370880127, + "grad_norm": 2.6890764236450195, "learning_rate": 9.615384615384616e-06, - "loss": 5.8119, + "loss": 4.6163, "step": 2500 }, { "epoch": 17.644464049404498, - "eval_loss": 5.5971198081970215, - "eval_runtime": 192.619, - "eval_samples_per_second": 51.916, - "eval_steps_per_second": 6.489, + "eval_loss": 4.387485504150391, + "eval_runtime": 192.7331, + "eval_samples_per_second": 51.885, + "eval_steps_per_second": 6.486, "step": 2500 }, { "epoch": 18.35024261138068, - "grad_norm": 7.878041744232178, + "grad_norm": 3.5793533325195312, "learning_rate": 9.598662207357861e-06, - "loss": 5.7834, + "loss": 4.6039, "step": 2600 }, { "epoch": 18.35024261138068, - "eval_loss": 5.6158366203308105, - "eval_runtime": 192.9746, - "eval_samples_per_second": 51.82, - "eval_steps_per_second": 6.478, + "eval_loss": 4.388312339782715, + "eval_runtime": 192.6508, + "eval_samples_per_second": 51.907, + "eval_steps_per_second": 6.488, "step": 2600 }, { "epoch": 19.05602117335686, - "grad_norm": 4.609034061431885, + "grad_norm": 2.4879586696624756, "learning_rate": 9.581939799331105e-06, - "loss": 5.7621, + "loss": 4.5951, "step": 2700 }, { "epoch": 19.05602117335686, - "eval_loss": 5.559902191162109, - "eval_runtime": 192.5603, - "eval_samples_per_second": 51.932, - "eval_steps_per_second": 6.491, + "eval_loss": 4.378627300262451, + "eval_runtime": 192.7297, + "eval_samples_per_second": 51.886, + "eval_steps_per_second": 6.486, "step": 2700 }, { "epoch": 19.76179973533304, - "grad_norm": 5.649851322174072, + "grad_norm": 4.8779072761535645, "learning_rate": 9.565217391304349e-06, - "loss": 5.7314, + "loss": 4.592, "step": 2800 }, { "epoch": 19.76179973533304, - "eval_loss": 5.544479846954346, - "eval_runtime": 192.9774, - "eval_samples_per_second": 51.82, - "eval_steps_per_second": 6.477, + "eval_loss": 4.376707077026367, + "eval_runtime": 191.5339, + "eval_samples_per_second": 52.21, + "eval_steps_per_second": 6.526, "step": 2800 }, { "epoch": 20.46757829730922, - "grad_norm": 5.958551406860352, + "grad_norm": 4.811100006103516, "learning_rate": 9.548494983277592e-06, - "loss": 5.7083, + "loss": 4.5825, "step": 2900 }, { "epoch": 20.46757829730922, - "eval_loss": 5.522533416748047, - "eval_runtime": 192.9165, - "eval_samples_per_second": 51.836, - "eval_steps_per_second": 6.479, + "eval_loss": 4.363040447235107, + "eval_runtime": 192.7097, + "eval_samples_per_second": 51.892, + "eval_steps_per_second": 6.486, "step": 2900 }, { "epoch": 21.1733568592854, - "grad_norm": 4.8862504959106445, + "grad_norm": 3.4796102046966553, "learning_rate": 9.531772575250838e-06, - "loss": 5.6873, + "loss": 4.5731, "step": 3000 }, { "epoch": 21.1733568592854, - "eval_loss": 5.565715789794922, - "eval_runtime": 192.9833, - "eval_samples_per_second": 51.818, - "eval_steps_per_second": 6.477, + "eval_loss": 4.348217964172363, + "eval_runtime": 192.7117, + "eval_samples_per_second": 51.891, + "eval_steps_per_second": 6.486, "step": 3000 }, { "epoch": 21.87913542126158, - "grad_norm": 5.906116962432861, + "grad_norm": 3.2958459854125977, "learning_rate": 9.515050167224082e-06, - "loss": 5.6758, + "loss": 4.5611, "step": 3100 }, { "epoch": 21.87913542126158, - "eval_loss": 5.535135746002197, - "eval_runtime": 193.0783, - "eval_samples_per_second": 51.792, - "eval_steps_per_second": 6.474, + "eval_loss": 4.336493968963623, + "eval_runtime": 192.6996, + "eval_samples_per_second": 51.894, + "eval_steps_per_second": 6.487, "step": 3100 }, { "epoch": 22.584913983237758, - "grad_norm": 4.937218189239502, + "grad_norm": 3.4779412746429443, "learning_rate": 9.498327759197325e-06, - "loss": 5.6546, + "loss": 4.5518, "step": 3200 }, { "epoch": 22.584913983237758, - "eval_loss": 5.504598617553711, - "eval_runtime": 193.0866, - "eval_samples_per_second": 51.79, - "eval_steps_per_second": 6.474, + "eval_loss": 4.320661544799805, + "eval_runtime": 192.7594, + "eval_samples_per_second": 51.878, + "eval_steps_per_second": 6.485, "step": 3200 }, { "epoch": 23.29069254521394, - "grad_norm": 3.9060332775115967, + "grad_norm": 2.7907259464263916, "learning_rate": 9.48160535117057e-06, - "loss": 5.6276, + "loss": 4.5401, "step": 3300 }, { "epoch": 23.29069254521394, - "eval_loss": 5.466856956481934, - "eval_runtime": 192.9581, - "eval_samples_per_second": 51.825, - "eval_steps_per_second": 6.478, + "eval_loss": 4.316312313079834, + "eval_runtime": 192.9908, + "eval_samples_per_second": 51.816, + "eval_steps_per_second": 6.477, "step": 3300 }, { "epoch": 23.996471107190118, - "grad_norm": 4.933598518371582, + "grad_norm": 3.1774535179138184, "learning_rate": 9.464882943143815e-06, - "loss": 5.6068, + "loss": 4.5307, "step": 3400 }, { "epoch": 23.996471107190118, - "eval_loss": 5.424036026000977, - "eval_runtime": 192.7995, - "eval_samples_per_second": 51.867, - "eval_steps_per_second": 6.483, + "eval_loss": 4.305875778198242, + "eval_runtime": 192.9792, + "eval_samples_per_second": 51.819, + "eval_steps_per_second": 6.477, "step": 3400 }, { "epoch": 24.7022496691663, - "grad_norm": 3.609147548675537, + "grad_norm": 4.013304233551025, "learning_rate": 9.448160535117058e-06, - "loss": 5.5852, + "loss": 4.5236, "step": 3500 }, { "epoch": 24.7022496691663, - "eval_loss": 5.414628028869629, - "eval_runtime": 192.754, - "eval_samples_per_second": 51.88, - "eval_steps_per_second": 6.485, + "eval_loss": 4.303380012512207, + "eval_runtime": 192.8249, + "eval_samples_per_second": 51.861, + "eval_steps_per_second": 6.483, "step": 3500 }, { "epoch": 25.408028231142477, - "grad_norm": 5.111537456512451, + "grad_norm": 3.611783504486084, "learning_rate": 9.431438127090302e-06, - "loss": 5.5704, + "loss": 4.5173, "step": 3600 }, { "epoch": 25.408028231142477, - "eval_loss": 5.408510684967041, - "eval_runtime": 192.584, - "eval_samples_per_second": 51.925, - "eval_steps_per_second": 6.491, + "eval_loss": 4.297857761383057, + "eval_runtime": 192.8889, + "eval_samples_per_second": 51.843, + "eval_steps_per_second": 6.48, "step": 3600 }, { "epoch": 26.11380679311866, - "grad_norm": 6.402336597442627, + "grad_norm": 2.8548221588134766, "learning_rate": 9.414715719063546e-06, - "loss": 5.5551, + "loss": 4.5052, "step": 3700 }, { "epoch": 26.11380679311866, - "eval_loss": 5.369451522827148, - "eval_runtime": 192.6935, - "eval_samples_per_second": 51.896, - "eval_steps_per_second": 6.487, + "eval_loss": 4.289151668548584, + "eval_runtime": 192.7224, + "eval_samples_per_second": 51.888, + "eval_steps_per_second": 6.486, "step": 3700 }, { "epoch": 26.81958535509484, - "grad_norm": 6.334754943847656, + "grad_norm": 3.3853394985198975, "learning_rate": 9.39799331103679e-06, - "loss": 5.5439, + "loss": 4.4983, "step": 3800 }, { "epoch": 26.81958535509484, - "eval_loss": 5.353564262390137, - "eval_runtime": 192.9116, - "eval_samples_per_second": 51.837, - "eval_steps_per_second": 6.48, + "eval_loss": 4.275169372558594, + "eval_runtime": 192.7669, + "eval_samples_per_second": 51.876, + "eval_steps_per_second": 6.485, "step": 3800 }, { "epoch": 27.52536391707102, - "grad_norm": 5.57678747177124, + "grad_norm": 2.4811007976531982, "learning_rate": 9.381270903010035e-06, - "loss": 5.5269, + "loss": 4.4876, "step": 3900 }, { "epoch": 27.52536391707102, - "eval_loss": 5.3518571853637695, - "eval_runtime": 192.5248, - "eval_samples_per_second": 51.941, - "eval_steps_per_second": 6.493, + "eval_loss": 4.272207736968994, + "eval_runtime": 192.9342, + "eval_samples_per_second": 51.831, + "eval_steps_per_second": 6.479, "step": 3900 }, { "epoch": 28.2311424790472, - "grad_norm": 6.486103057861328, + "grad_norm": 2.3202896118164062, "learning_rate": 9.364548494983279e-06, - "loss": 5.507, + "loss": 4.4792, "step": 4000 }, { "epoch": 28.2311424790472, - "eval_loss": 5.373325824737549, - "eval_runtime": 192.6379, - "eval_samples_per_second": 51.911, - "eval_steps_per_second": 6.489, + "eval_loss": 4.268562316894531, + "eval_runtime": 191.834, + "eval_samples_per_second": 52.128, + "eval_steps_per_second": 6.516, "step": 4000 }, { "epoch": 28.93692104102338, - "grad_norm": 3.3786087036132812, + "grad_norm": 2.385047674179077, "learning_rate": 9.347826086956523e-06, - "loss": 5.4893, + "loss": 4.4711, "step": 4100 }, { "epoch": 28.93692104102338, - "eval_loss": 5.322321891784668, - "eval_runtime": 192.6567, - "eval_samples_per_second": 51.906, - "eval_steps_per_second": 6.488, + "eval_loss": 4.258542537689209, + "eval_runtime": 192.3883, + "eval_samples_per_second": 51.978, + "eval_steps_per_second": 6.497, "step": 4100 }, { "epoch": 29.64269960299956, - "grad_norm": 6.318047523498535, + "grad_norm": 2.873243808746338, "learning_rate": 9.331103678929766e-06, - "loss": 5.4737, + "loss": 4.4627, "step": 4200 }, { "epoch": 29.64269960299956, - "eval_loss": 5.297492027282715, - "eval_runtime": 192.8666, - "eval_samples_per_second": 51.849, - "eval_steps_per_second": 6.481, + "eval_loss": 4.245582580566406, + "eval_runtime": 192.4906, + "eval_samples_per_second": 51.951, + "eval_steps_per_second": 6.494, "step": 4200 }, { "epoch": 30.348478164975738, - "grad_norm": 5.060636520385742, + "grad_norm": 2.814659833908081, "learning_rate": 9.314381270903012e-06, - "loss": 5.4656, + "loss": 4.4557, "step": 4300 }, { "epoch": 30.348478164975738, - "eval_loss": 5.279904842376709, - "eval_runtime": 193.0317, - "eval_samples_per_second": 51.805, - "eval_steps_per_second": 6.476, + "eval_loss": 4.239570140838623, + "eval_runtime": 192.8746, + "eval_samples_per_second": 51.847, + "eval_steps_per_second": 6.481, "step": 4300 }, { "epoch": 31.05425672695192, - "grad_norm": 7.1775641441345215, + "grad_norm": 2.69787859916687, "learning_rate": 9.297658862876256e-06, - "loss": 5.4425, + "loss": 4.4454, "step": 4400 }, { "epoch": 31.05425672695192, - "eval_loss": 5.276424884796143, - "eval_runtime": 193.0373, - "eval_samples_per_second": 51.803, - "eval_steps_per_second": 6.475, + "eval_loss": 4.234774589538574, + "eval_runtime": 192.878, + "eval_samples_per_second": 51.846, + "eval_steps_per_second": 6.481, "step": 4400 }, { "epoch": 31.760035288928098, - "grad_norm": 5.923018455505371, + "grad_norm": 2.860370397567749, "learning_rate": 9.2809364548495e-06, - "loss": 5.4285, + "loss": 4.4384, "step": 4500 }, { "epoch": 31.760035288928098, - "eval_loss": 5.2852349281311035, - "eval_runtime": 193.0318, - "eval_samples_per_second": 51.805, + "eval_loss": 4.227160930633545, + "eval_runtime": 193.0111, + "eval_samples_per_second": 51.811, "eval_steps_per_second": 6.476, "step": 4500 }, { "epoch": 32.465813850904276, - "grad_norm": 3.954585313796997, + "grad_norm": 2.4307503700256348, "learning_rate": 9.264214046822743e-06, - "loss": 5.417, + "loss": 4.4306, "step": 4600 }, { "epoch": 32.465813850904276, - "eval_loss": 5.270089149475098, - "eval_runtime": 192.9771, - "eval_samples_per_second": 51.82, - "eval_steps_per_second": 6.477, + "eval_loss": 4.218355655670166, + "eval_runtime": 192.9008, + "eval_samples_per_second": 51.84, + "eval_steps_per_second": 6.48, "step": 4600 }, { "epoch": 33.17159241288046, - "grad_norm": 3.399170160293579, + "grad_norm": 2.4772098064422607, "learning_rate": 9.247491638795989e-06, - "loss": 5.4018, + "loss": 4.4233, "step": 4700 }, { "epoch": 33.17159241288046, - "eval_loss": 5.233738899230957, - "eval_runtime": 192.9309, - "eval_samples_per_second": 51.832, - "eval_steps_per_second": 6.479, + "eval_loss": 4.208319664001465, + "eval_runtime": 191.9987, + "eval_samples_per_second": 52.084, + "eval_steps_per_second": 6.51, "step": 4700 }, { "epoch": 33.87737097485664, - "grad_norm": 2.5665080547332764, + "grad_norm": 3.371622085571289, "learning_rate": 9.230769230769232e-06, - "loss": 5.3866, + "loss": 4.4188, "step": 4800 }, { "epoch": 33.87737097485664, - "eval_loss": 5.215952396392822, - "eval_runtime": 192.966, - "eval_samples_per_second": 51.823, - "eval_steps_per_second": 6.478, + "eval_loss": 4.222493648529053, + "eval_runtime": 192.3929, + "eval_samples_per_second": 51.977, + "eval_steps_per_second": 6.497, "step": 4800 }, { "epoch": 34.58314953683282, - "grad_norm": 4.100139617919922, + "grad_norm": 3.5253772735595703, "learning_rate": 9.214046822742476e-06, - "loss": 5.3769, + "loss": 4.4141, "step": 4900 }, { "epoch": 34.58314953683282, - "eval_loss": 5.216548442840576, - "eval_runtime": 192.8703, - "eval_samples_per_second": 51.848, - "eval_steps_per_second": 6.481, + "eval_loss": 4.214654922485352, + "eval_runtime": 192.7046, + "eval_samples_per_second": 51.893, + "eval_steps_per_second": 6.487, "step": 4900 }, { "epoch": 35.288928098809, - "grad_norm": 3.539120674133301, + "grad_norm": 3.155348300933838, "learning_rate": 9.19732441471572e-06, - "loss": 5.3605, + "loss": 4.4075, "step": 5000 }, { "epoch": 35.288928098809, - "eval_loss": 5.221481800079346, - "eval_runtime": 193.0338, - "eval_samples_per_second": 51.804, - "eval_steps_per_second": 6.476, + "eval_loss": 4.202410697937012, + "eval_runtime": 192.8791, + "eval_samples_per_second": 51.846, + "eval_steps_per_second": 6.481, "step": 5000 }, { - "epoch": 35.99470666078518, - "grad_norm": 5.789709568023682, + "epoch": 72.8348623853211, + "grad_norm": 3.394275188446045, "learning_rate": 9.180602006688965e-06, - "loss": 5.3473, + "loss": 4.3981, "step": 5100 }, { - "epoch": 35.99470666078518, - "eval_loss": 5.1859660148620605, - "eval_runtime": 192.9823, - "eval_samples_per_second": 51.818, - "eval_steps_per_second": 6.477, + "epoch": 72.8348623853211, + "eval_loss": 4.189794063568115, + "eval_runtime": 75.5217, + "eval_samples_per_second": 132.412, + "eval_steps_per_second": 8.276, "step": 5100 }, { - "epoch": 36.70048522276136, - "grad_norm": 5.602957248687744, + "epoch": 74.24629498941425, + "grad_norm": 2.187847375869751, "learning_rate": 9.163879598662207e-06, - "loss": 5.3413, + "loss": 4.3878, "step": 5200 }, { - "epoch": 36.70048522276136, - "eval_loss": 5.215543270111084, - "eval_runtime": 192.9831, - "eval_samples_per_second": 51.818, - "eval_steps_per_second": 6.477, + "epoch": 74.24629498941425, + "eval_loss": 4.175257205963135, + "eval_runtime": 75.2341, + "eval_samples_per_second": 132.918, + "eval_steps_per_second": 8.307, "step": 5200 }, { - "epoch": 37.40626378473754, - "grad_norm": 5.233423709869385, + "epoch": 75.65772759350742, + "grad_norm": 2.895294427871704, "learning_rate": 9.147157190635451e-06, - "loss": 5.3315, + "loss": 4.3818, "step": 5300 }, { - "epoch": 37.40626378473754, - "eval_loss": 5.199837684631348, - "eval_runtime": 192.9629, - "eval_samples_per_second": 51.823, - "eval_steps_per_second": 6.478, + "epoch": 75.65772759350742, + "eval_loss": 4.169253349304199, + "eval_runtime": 75.1154, + "eval_samples_per_second": 133.129, + "eval_steps_per_second": 8.321, "step": 5300 }, { - "epoch": 38.11204234671372, - "grad_norm": 2.9341530799865723, + "epoch": 77.06916019760057, + "grad_norm": 2.3158841133117676, "learning_rate": 9.130434782608697e-06, - "loss": 5.3182, + "loss": 4.3728, "step": 5400 }, { - "epoch": 38.11204234671372, - "eval_loss": 5.1667256355285645, - "eval_runtime": 193.0978, - "eval_samples_per_second": 51.787, - "eval_steps_per_second": 6.473, + "epoch": 77.06916019760057, + "eval_loss": 4.166540622711182, + "eval_runtime": 75.1279, + "eval_samples_per_second": 133.106, + "eval_steps_per_second": 8.319, "step": 5400 }, { - "epoch": 38.817820908689896, - "grad_norm": 3.455245018005371, + "epoch": 78.48059280169372, + "grad_norm": 2.402764081954956, "learning_rate": 9.11371237458194e-06, - "loss": 5.3047, + "loss": 4.3659, "step": 5500 }, { - "epoch": 38.817820908689896, - "eval_loss": 5.1457672119140625, - "eval_runtime": 193.0186, - "eval_samples_per_second": 51.808, - "eval_steps_per_second": 6.476, + "epoch": 78.48059280169372, + "eval_loss": 4.161038875579834, + "eval_runtime": 74.9365, + "eval_samples_per_second": 133.446, + "eval_steps_per_second": 8.34, "step": 5500 }, { - "epoch": 39.52359947066608, - "grad_norm": 5.229405403137207, + "epoch": 79.89202540578687, + "grad_norm": 2.1741960048675537, "learning_rate": 9.096989966555184e-06, - "loss": 5.2895, + "loss": 4.3566, "step": 5600 }, { - "epoch": 39.52359947066608, - "eval_loss": 5.149688720703125, - "eval_runtime": 192.813, - "eval_samples_per_second": 51.864, - "eval_steps_per_second": 6.483, + "epoch": 79.89202540578687, + "eval_loss": 4.145488262176514, + "eval_runtime": 75.1211, + "eval_samples_per_second": 133.118, + "eval_steps_per_second": 8.32, "step": 5600 }, { - "epoch": 40.22937803264226, - "grad_norm": 3.6030755043029785, + "epoch": 81.30345800988003, + "grad_norm": 1.9842256307601929, "learning_rate": 9.080267558528428e-06, - "loss": 5.2806, + "loss": 4.3473, "step": 5700 }, { - "epoch": 40.22937803264226, - "eval_loss": 5.13616943359375, - "eval_runtime": 192.7776, - "eval_samples_per_second": 51.873, - "eval_steps_per_second": 6.484, + "epoch": 81.30345800988003, + "eval_loss": 4.139003753662109, + "eval_runtime": 75.1615, + "eval_samples_per_second": 133.047, + "eval_steps_per_second": 8.315, "step": 5700 }, { - "epoch": 40.93515659461844, - "grad_norm": 5.300965785980225, + "epoch": 82.71489061397318, + "grad_norm": 2.0038816928863525, "learning_rate": 9.063545150501673e-06, - "loss": 5.2704, + "loss": 4.3389, "step": 5800 }, { - "epoch": 40.93515659461844, - "eval_loss": 5.1248393058776855, - "eval_runtime": 192.9978, - "eval_samples_per_second": 51.814, - "eval_steps_per_second": 6.477, + "epoch": 82.71489061397318, + "eval_loss": 4.1378560066223145, + "eval_runtime": 75.1493, + "eval_samples_per_second": 133.068, + "eval_steps_per_second": 8.317, "step": 5800 }, { - "epoch": 41.640935156594615, - "grad_norm": 3.444725751876831, + "epoch": 84.12632321806633, + "grad_norm": 2.4109838008880615, "learning_rate": 9.046822742474917e-06, - "loss": 5.2586, + "loss": 4.3293, "step": 5900 }, { - "epoch": 41.640935156594615, - "eval_loss": 5.108316421508789, - "eval_runtime": 191.8297, - "eval_samples_per_second": 52.13, - "eval_steps_per_second": 6.516, + "epoch": 84.12632321806633, + "eval_loss": 4.127132415771484, + "eval_runtime": 75.1873, + "eval_samples_per_second": 133.001, + "eval_steps_per_second": 8.313, "step": 5900 }, { - "epoch": 42.3467137185708, - "grad_norm": 3.918243646621704, + "epoch": 85.5377558221595, + "grad_norm": 2.1353368759155273, "learning_rate": 9.03010033444816e-06, - "loss": 5.2472, + "loss": 4.3203, "step": 6000 }, { - "epoch": 42.3467137185708, - "eval_loss": 5.101113319396973, - "eval_runtime": 193.0403, - "eval_samples_per_second": 51.803, - "eval_steps_per_second": 6.475, + "epoch": 85.5377558221595, + "eval_loss": 4.116780757904053, + "eval_runtime": 74.9341, + "eval_samples_per_second": 133.451, + "eval_steps_per_second": 8.341, "step": 6000 }, { - "epoch": 43.05249228054698, - "grad_norm": 4.100588321685791, + "epoch": 86.94918842625265, + "grad_norm": 2.372614860534668, "learning_rate": 9.013377926421405e-06, - "loss": 5.236, + "loss": 4.3126, "step": 6100 }, { - "epoch": 43.05249228054698, - "eval_loss": 5.10250997543335, - "eval_runtime": 193.0357, - "eval_samples_per_second": 51.804, - "eval_steps_per_second": 6.475, + "epoch": 86.94918842625265, + "eval_loss": 4.109887599945068, + "eval_runtime": 75.1685, + "eval_samples_per_second": 133.034, + "eval_steps_per_second": 8.315, "step": 6100 }, { - "epoch": 43.75827084252316, - "grad_norm": 1.832243800163269, + "epoch": 88.3606210303458, + "grad_norm": 1.8894693851470947, "learning_rate": 8.996655518394648e-06, - "loss": 5.2244, + "loss": 4.3013, "step": 6200 }, { - "epoch": 43.75827084252316, - "eval_loss": 5.086152076721191, - "eval_runtime": 193.0384, - "eval_samples_per_second": 51.803, - "eval_steps_per_second": 6.475, + "epoch": 88.3606210303458, + "eval_loss": 4.100041389465332, + "eval_runtime": 75.1752, + "eval_samples_per_second": 133.023, + "eval_steps_per_second": 8.314, "step": 6200 }, { - "epoch": 44.46404940449934, - "grad_norm": 6.217077732086182, + "epoch": 89.77205363443896, + "grad_norm": 1.9410585165023804, "learning_rate": 8.979933110367894e-06, - "loss": 5.2212, + "loss": 4.2934, "step": 6300 }, { - "epoch": 44.46404940449934, - "eval_loss": 5.0455851554870605, - "eval_runtime": 191.6044, - "eval_samples_per_second": 52.191, - "eval_steps_per_second": 6.524, + "epoch": 89.77205363443896, + "eval_loss": 4.090487480163574, + "eval_runtime": 75.1691, + "eval_samples_per_second": 133.033, + "eval_steps_per_second": 8.315, "step": 6300 }, { - "epoch": 45.169827966475516, - "grad_norm": 6.123181343078613, + "epoch": 91.18348623853211, + "grad_norm": 3.4204158782958984, "learning_rate": 8.963210702341138e-06, - "loss": 5.2143, + "loss": 4.2875, "step": 6400 }, { - "epoch": 45.169827966475516, - "eval_loss": 5.037074089050293, - "eval_runtime": 193.1132, - "eval_samples_per_second": 51.783, - "eval_steps_per_second": 6.473, + "epoch": 91.18348623853211, + "eval_loss": 4.08727502822876, + "eval_runtime": 75.1834, + "eval_samples_per_second": 133.008, + "eval_steps_per_second": 8.313, "step": 6400 }, { - "epoch": 45.8756065284517, - "grad_norm": 4.41541862487793, + "epoch": 92.59491884262526, + "grad_norm": 3.0449135303497314, "learning_rate": 8.946488294314381e-06, - "loss": 5.2033, + "loss": 4.2808, "step": 6500 }, { - "epoch": 45.8756065284517, - "eval_loss": 5.042657852172852, - "eval_runtime": 193.071, - "eval_samples_per_second": 51.794, - "eval_steps_per_second": 6.474, + "epoch": 92.59491884262526, + "eval_loss": 4.0819501876831055, + "eval_runtime": 74.8907, + "eval_samples_per_second": 133.528, + "eval_steps_per_second": 8.345, "step": 6500 }, { - "epoch": 46.58138509042788, - "grad_norm": 4.971529960632324, + "epoch": 94.00635144671843, + "grad_norm": 1.9112857580184937, "learning_rate": 8.929765886287625e-06, - "loss": 5.1915, + "loss": 4.2735, "step": 6600 }, { - "epoch": 46.58138509042788, - "eval_loss": 5.053555488586426, - "eval_runtime": 192.9877, - "eval_samples_per_second": 51.817, - "eval_steps_per_second": 6.477, + "epoch": 94.00635144671843, + "eval_loss": 4.071495056152344, + "eval_runtime": 75.2761, + "eval_samples_per_second": 132.844, + "eval_steps_per_second": 8.303, "step": 6600 }, { - "epoch": 47.28716365240406, - "grad_norm": 4.590278148651123, + "epoch": 95.41778405081158, + "grad_norm": 1.7011171579360962, "learning_rate": 8.91304347826087e-06, - "loss": 5.183, + "loss": 4.264, "step": 6700 }, { - "epoch": 47.28716365240406, - "eval_loss": 5.035336494445801, - "eval_runtime": 193.0308, - "eval_samples_per_second": 51.805, - "eval_steps_per_second": 6.476, + "epoch": 95.41778405081158, + "eval_loss": 4.066438674926758, + "eval_runtime": 75.215, + "eval_samples_per_second": 132.952, + "eval_steps_per_second": 8.31, "step": 6700 }, { - "epoch": 47.992942214380236, - "grad_norm": 4.004854202270508, + "epoch": 96.82921665490473, + "grad_norm": 1.510539174079895, "learning_rate": 8.896321070234114e-06, - "loss": 5.1736, + "loss": 4.2556, "step": 6800 }, { - "epoch": 47.992942214380236, - "eval_loss": 5.029347896575928, - "eval_runtime": 193.7814, - "eval_samples_per_second": 51.605, - "eval_steps_per_second": 6.451, + "epoch": 96.82921665490473, + "eval_loss": 4.061617851257324, + "eval_runtime": 75.2534, + "eval_samples_per_second": 132.884, + "eval_steps_per_second": 8.305, "step": 6800 }, { - "epoch": 48.69872077635642, - "grad_norm": 3.0193686485290527, + "epoch": 98.24064925899788, + "grad_norm": 1.6027462482452393, "learning_rate": 8.879598662207358e-06, - "loss": 5.1617, + "loss": 4.2455, "step": 6900 }, { - "epoch": 48.69872077635642, - "eval_loss": 5.004369735717773, - "eval_runtime": 192.3323, - "eval_samples_per_second": 51.993, - "eval_steps_per_second": 6.499, + "epoch": 98.24064925899788, + "eval_loss": 4.0491743087768555, + "eval_runtime": 75.2588, + "eval_samples_per_second": 132.875, + "eval_steps_per_second": 8.305, "step": 6900 }, { - "epoch": 49.4044993383326, - "grad_norm": 3.199803590774536, + "epoch": 99.65208186309104, + "grad_norm": 1.8287110328674316, "learning_rate": 8.862876254180602e-06, - "loss": 5.1549, + "loss": 4.2361, "step": 7000 }, { - "epoch": 49.4044993383326, - "eval_loss": 5.010226726531982, - "eval_runtime": 192.7784, - "eval_samples_per_second": 51.873, - "eval_steps_per_second": 6.484, + "epoch": 99.65208186309104, + "eval_loss": 4.048295974731445, + "eval_runtime": 74.9626, + "eval_samples_per_second": 133.4, + "eval_steps_per_second": 8.337, "step": 7000 }, { - "epoch": 50.11027790030878, - "grad_norm": 4.2160563468933105, + "epoch": 101.06351446718419, + "grad_norm": 1.6138700246810913, "learning_rate": 8.846153846153847e-06, - "loss": 5.1425, + "loss": 4.2272, "step": 7100 }, { - "epoch": 50.11027790030878, - "eval_loss": 5.005931854248047, - "eval_runtime": 192.9712, - "eval_samples_per_second": 51.821, - "eval_steps_per_second": 6.478, + "epoch": 101.06351446718419, + "eval_loss": 4.032159805297852, + "eval_runtime": 75.1461, + "eval_samples_per_second": 133.074, + "eval_steps_per_second": 8.317, "step": 7100 }, { - "epoch": 50.816056462284955, - "grad_norm": 4.614100933074951, + "epoch": 102.47494707127734, + "grad_norm": 1.5357685089111328, "learning_rate": 8.829431438127091e-06, - "loss": 5.1403, + "loss": 4.2171, "step": 7200 }, { - "epoch": 50.816056462284955, - "eval_loss": 5.039505481719971, - "eval_runtime": 192.1426, - "eval_samples_per_second": 52.045, - "eval_steps_per_second": 6.506, + "epoch": 102.47494707127734, + "eval_loss": 4.029723167419434, + "eval_runtime": 75.1903, + "eval_samples_per_second": 132.996, + "eval_steps_per_second": 8.312, "step": 7200 }, { - "epoch": 51.52183502426114, - "grad_norm": 4.06365442276001, + "epoch": 103.8863796753705, + "grad_norm": 2.423367977142334, "learning_rate": 8.812709030100335e-06, - "loss": 5.1324, + "loss": 4.2108, "step": 7300 }, { - "epoch": 51.52183502426114, - "eval_loss": 5.02189826965332, - "eval_runtime": 192.8089, - "eval_samples_per_second": 51.865, - "eval_steps_per_second": 6.483, + "epoch": 103.8863796753705, + "eval_loss": 4.024651527404785, + "eval_runtime": 75.1893, + "eval_samples_per_second": 132.998, + "eval_steps_per_second": 8.312, "step": 7300 }, { - "epoch": 52.22761358623732, - "grad_norm": 3.0032901763916016, + "epoch": 105.29781227946366, + "grad_norm": 1.7042992115020752, "learning_rate": 8.795986622073578e-06, - "loss": 5.1214, + "loss": 4.203, "step": 7400 }, { - "epoch": 52.22761358623732, - "eval_loss": 4.965108394622803, - "eval_runtime": 192.9329, - "eval_samples_per_second": 51.831, - "eval_steps_per_second": 6.479, + "epoch": 105.29781227946366, + "eval_loss": 4.0152106285095215, + "eval_runtime": 74.9058, + "eval_samples_per_second": 133.501, + "eval_steps_per_second": 8.344, "step": 7400 }, { - "epoch": 52.9333921482135, - "grad_norm": 5.5580220222473145, + "epoch": 106.7092448835568, + "grad_norm": 1.934262752532959, "learning_rate": 8.779264214046824e-06, - "loss": 5.1104, + "loss": 4.1943, "step": 7500 }, { - "epoch": 52.9333921482135, - "eval_loss": 4.973033428192139, - "eval_runtime": 192.7851, - "eval_samples_per_second": 51.871, - "eval_steps_per_second": 6.484, + "epoch": 106.7092448835568, + "eval_loss": 4.006458282470703, + "eval_runtime": 75.0162, + "eval_samples_per_second": 133.305, + "eval_steps_per_second": 8.332, "step": 7500 }, { - "epoch": 53.63917071018968, - "grad_norm": 4.173678398132324, + "epoch": 108.12067748764997, + "grad_norm": 1.7501742839813232, "learning_rate": 8.762541806020068e-06, - "loss": 5.1055, + "loss": 4.1853, "step": 7600 }, { - "epoch": 53.63917071018968, - "eval_loss": 4.955119609832764, - "eval_runtime": 192.6344, - "eval_samples_per_second": 51.912, - "eval_steps_per_second": 6.489, + "epoch": 108.12067748764997, + "eval_loss": 4.001890182495117, + "eval_runtime": 75.2517, + "eval_samples_per_second": 132.887, + "eval_steps_per_second": 8.305, "step": 7600 }, { - "epoch": 54.344949272165856, - "grad_norm": 3.4894068241119385, + "epoch": 109.53211009174312, + "grad_norm": 1.874009609222412, "learning_rate": 8.745819397993311e-06, - "loss": 5.0968, + "loss": 4.1751, "step": 7700 }, { - "epoch": 54.344949272165856, - "eval_loss": 4.955358028411865, - "eval_runtime": 192.8374, - "eval_samples_per_second": 51.857, - "eval_steps_per_second": 6.482, + "epoch": 109.53211009174312, + "eval_loss": 3.9898202419281006, + "eval_runtime": 75.147, + "eval_samples_per_second": 133.072, + "eval_steps_per_second": 8.317, "step": 7700 }, { - "epoch": 55.05072783414204, - "grad_norm": 4.601551532745361, + "epoch": 110.94354269583627, + "grad_norm": 1.5682804584503174, "learning_rate": 8.729096989966555e-06, - "loss": 5.0924, + "loss": 4.1656, "step": 7800 }, { - "epoch": 55.05072783414204, - "eval_loss": 4.921985149383545, - "eval_runtime": 192.8707, - "eval_samples_per_second": 51.848, - "eval_steps_per_second": 6.481, + "epoch": 110.94354269583627, + "eval_loss": 3.9801883697509766, + "eval_runtime": 75.3127, + "eval_samples_per_second": 132.78, + "eval_steps_per_second": 8.299, "step": 7800 }, { - "epoch": 55.75650639611822, - "grad_norm": 4.37216854095459, + "epoch": 112.35497529992942, + "grad_norm": 1.5020002126693726, "learning_rate": 8.712374581939799e-06, - "loss": 5.0861, + "loss": 4.1549, "step": 7900 }, { - "epoch": 55.75650639611822, - "eval_loss": 4.916751861572266, - "eval_runtime": 193.0131, - "eval_samples_per_second": 51.81, - "eval_steps_per_second": 6.476, + "epoch": 112.35497529992942, + "eval_loss": 3.973633289337158, + "eval_runtime": 75.0416, + "eval_samples_per_second": 133.259, + "eval_steps_per_second": 8.329, "step": 7900 }, { - "epoch": 56.4622849580944, - "grad_norm": 3.791567087173462, + "epoch": 113.76640790402259, + "grad_norm": 1.4100362062454224, "learning_rate": 8.695652173913044e-06, - "loss": 5.0776, + "loss": 4.1444, "step": 8000 }, { - "epoch": 56.4622849580944, - "eval_loss": 4.91188907623291, - "eval_runtime": 193.067, - "eval_samples_per_second": 51.795, - "eval_steps_per_second": 6.474, + "epoch": 113.76640790402259, + "eval_loss": 3.9643702507019043, + "eval_runtime": 75.2941, + "eval_samples_per_second": 132.813, + "eval_steps_per_second": 8.301, "step": 8000 }, { - "epoch": 57.168063520070575, - "grad_norm": 4.161510467529297, + "epoch": 115.17784050811574, + "grad_norm": 1.2643848657608032, "learning_rate": 8.678929765886288e-06, - "loss": 5.0688, + "loss": 4.1331, "step": 8100 }, { - "epoch": 57.168063520070575, - "eval_loss": 4.917378902435303, - "eval_runtime": 192.9101, - "eval_samples_per_second": 51.838, - "eval_steps_per_second": 6.48, + "epoch": 115.17784050811574, + "eval_loss": 3.9529407024383545, + "eval_runtime": 75.2161, + "eval_samples_per_second": 132.95, + "eval_steps_per_second": 8.309, "step": 8100 }, { - "epoch": 57.87384208204676, - "grad_norm": 2.878750801086426, + "epoch": 116.58927311220889, + "grad_norm": 1.3504067659378052, "learning_rate": 8.662207357859532e-06, - "loss": 5.0624, + "loss": 4.1221, "step": 8200 }, { - "epoch": 57.87384208204676, - "eval_loss": 4.917529106140137, - "eval_runtime": 193.051, - "eval_samples_per_second": 51.8, - "eval_steps_per_second": 6.475, + "epoch": 116.58927311220889, + "eval_loss": 3.946784257888794, + "eval_runtime": 75.2097, + "eval_samples_per_second": 132.962, + "eval_steps_per_second": 8.31, "step": 8200 }, { - "epoch": 58.57962064402294, - "grad_norm": 4.06845235824585, + "epoch": 118.00070571630205, + "grad_norm": 1.3503401279449463, "learning_rate": 8.645484949832776e-06, - "loss": 5.0556, + "loss": 4.1107, "step": 8300 }, { - "epoch": 58.57962064402294, - "eval_loss": 4.9113616943359375, - "eval_runtime": 193.0375, - "eval_samples_per_second": 51.803, - "eval_steps_per_second": 6.475, + "epoch": 118.00070571630205, + "eval_loss": 3.936836004257202, + "eval_runtime": 75.1834, + "eval_samples_per_second": 133.008, + "eval_steps_per_second": 8.313, "step": 8300 }, { - "epoch": 59.28539920599912, - "grad_norm": 3.887568712234497, + "epoch": 119.4121383203952, + "grad_norm": 1.4280999898910522, "learning_rate": 8.628762541806021e-06, - "loss": 5.0451, + "loss": 4.1001, "step": 8400 }, { - "epoch": 59.28539920599912, - "eval_loss": 4.895881175994873, - "eval_runtime": 192.8532, - "eval_samples_per_second": 51.853, - "eval_steps_per_second": 6.482, + "epoch": 119.4121383203952, + "eval_loss": 3.926053047180176, + "eval_runtime": 75.2443, + "eval_samples_per_second": 132.9, + "eval_steps_per_second": 8.306, "step": 8400 }, { - "epoch": 59.991177767975294, - "grad_norm": 3.748574733734131, + "epoch": 120.82357092448835, + "grad_norm": 1.8372641801834106, "learning_rate": 8.612040133779265e-06, - "loss": 5.0394, + "loss": 4.0876, "step": 8500 }, { - "epoch": 59.991177767975294, - "eval_loss": 4.893756866455078, - "eval_runtime": 193.066, - "eval_samples_per_second": 51.796, - "eval_steps_per_second": 6.474, + "epoch": 120.82357092448835, + "eval_loss": 3.9173219203948975, + "eval_runtime": 75.0392, + "eval_samples_per_second": 133.264, + "eval_steps_per_second": 8.329, "step": 8500 }, { - "epoch": 60.696956329951476, - "grad_norm": 5.10497522354126, + "epoch": 122.23500352858152, + "grad_norm": 1.6978626251220703, "learning_rate": 8.595317725752509e-06, - "loss": 5.0348, + "loss": 4.079, "step": 8600 }, { - "epoch": 60.696956329951476, - "eval_loss": 4.908539295196533, - "eval_runtime": 192.8584, - "eval_samples_per_second": 51.852, - "eval_steps_per_second": 6.481, + "epoch": 122.23500352858152, + "eval_loss": 3.9100394248962402, + "eval_runtime": 75.1695, + "eval_samples_per_second": 133.033, + "eval_steps_per_second": 8.315, "step": 8600 }, { - "epoch": 61.40273489192766, - "grad_norm": 5.520150661468506, + "epoch": 123.64643613267467, + "grad_norm": 1.5298271179199219, "learning_rate": 8.578595317725752e-06, - "loss": 5.0289, + "loss": 4.069, "step": 8700 }, { - "epoch": 61.40273489192766, - "eval_loss": 4.903127193450928, - "eval_runtime": 193.0555, - "eval_samples_per_second": 51.799, - "eval_steps_per_second": 6.475, + "epoch": 123.64643613267467, + "eval_loss": 3.9006407260894775, + "eval_runtime": 75.13, + "eval_samples_per_second": 133.103, + "eval_steps_per_second": 8.319, "step": 8700 }, { - "epoch": 62.10851345390384, - "grad_norm": 3.925266742706299, + "epoch": 125.05786873676782, + "grad_norm": 1.4782963991165161, "learning_rate": 8.561872909698998e-06, - "loss": 5.0223, + "loss": 4.06, "step": 8800 }, { - "epoch": 62.10851345390384, - "eval_loss": 4.879857540130615, - "eval_runtime": 193.0601, - "eval_samples_per_second": 51.797, - "eval_steps_per_second": 6.475, + "epoch": 125.05786873676782, + "eval_loss": 3.8956563472747803, + "eval_runtime": 75.325, + "eval_samples_per_second": 132.758, + "eval_steps_per_second": 8.297, "step": 8800 }, { - "epoch": 62.81429201588002, - "grad_norm": 3.215996026992798, + "epoch": 126.46930134086098, + "grad_norm": 1.5350950956344604, "learning_rate": 8.545150501672242e-06, - "loss": 5.0125, + "loss": 4.0513, "step": 8900 }, { - "epoch": 62.81429201588002, - "eval_loss": 4.8555908203125, - "eval_runtime": 193.0262, - "eval_samples_per_second": 51.806, - "eval_steps_per_second": 6.476, + "epoch": 126.46930134086098, + "eval_loss": 3.887305498123169, + "eval_runtime": 75.2815, + "eval_samples_per_second": 132.835, + "eval_steps_per_second": 8.302, "step": 8900 }, { - "epoch": 63.520070577856195, - "grad_norm": 4.219306468963623, + "epoch": 127.88073394495413, + "grad_norm": 1.1390595436096191, "learning_rate": 8.528428093645485e-06, - "loss": 5.0056, + "loss": 4.0414, "step": 9000 }, { - "epoch": 63.520070577856195, - "eval_loss": 4.858587265014648, - "eval_runtime": 192.8868, - "eval_samples_per_second": 51.844, - "eval_steps_per_second": 6.48, + "epoch": 127.88073394495413, + "eval_loss": 3.882507801055908, + "eval_runtime": 75.0211, + "eval_samples_per_second": 133.296, + "eval_steps_per_second": 8.331, "step": 9000 }, { - "epoch": 64.22584913983238, - "grad_norm": 4.462502479553223, + "epoch": 129.2921665490473, + "grad_norm": 1.2423325777053833, "learning_rate": 8.511705685618729e-06, - "loss": 5.001, + "loss": 4.0329, "step": 9100 }, { - "epoch": 64.22584913983238, - "eval_loss": 4.859830379486084, - "eval_runtime": 193.1534, - "eval_samples_per_second": 51.772, - "eval_steps_per_second": 6.472, + "epoch": 129.2921665490473, + "eval_loss": 3.8772571086883545, + "eval_runtime": 75.2328, + "eval_samples_per_second": 132.921, + "eval_steps_per_second": 8.308, "step": 9100 }, { - "epoch": 64.93162770180855, - "grad_norm": 3.2140142917633057, + "epoch": 130.70359915314043, + "grad_norm": 1.1587265729904175, "learning_rate": 8.494983277591975e-06, - "loss": 4.9932, + "loss": 4.0247, "step": 9200 }, { - "epoch": 64.93162770180855, - "eval_loss": 4.84881591796875, - "eval_runtime": 193.0696, - "eval_samples_per_second": 51.795, - "eval_steps_per_second": 6.474, + "epoch": 130.70359915314043, + "eval_loss": 3.8697094917297363, + "eval_runtime": 75.2987, + "eval_samples_per_second": 132.804, + "eval_steps_per_second": 8.3, "step": 9200 }, { - "epoch": 65.63740626378474, - "grad_norm": 2.495354175567627, + "epoch": 132.1150317572336, + "grad_norm": 1.3564627170562744, "learning_rate": 8.478260869565218e-06, - "loss": 4.9838, + "loss": 4.0162, "step": 9300 }, { - "epoch": 65.63740626378474, - "eval_loss": 4.839079856872559, - "eval_runtime": 193.1026, - "eval_samples_per_second": 51.786, - "eval_steps_per_second": 6.473, + "epoch": 132.1150317572336, + "eval_loss": 3.8612961769104004, + "eval_runtime": 75.0366, + "eval_samples_per_second": 133.268, + "eval_steps_per_second": 8.329, "step": 9300 }, { - "epoch": 66.34318482576091, - "grad_norm": 2.639589548110962, + "epoch": 133.52646436132676, + "grad_norm": 1.1728644371032715, "learning_rate": 8.461538461538462e-06, - "loss": 4.977, + "loss": 4.0088, "step": 9400 }, { - "epoch": 66.34318482576091, - "eval_loss": 4.817923069000244, - "eval_runtime": 193.0942, - "eval_samples_per_second": 51.788, - "eval_steps_per_second": 6.474, + "epoch": 133.52646436132676, + "eval_loss": 3.853496789932251, + "eval_runtime": 75.1117, + "eval_samples_per_second": 133.135, + "eval_steps_per_second": 8.321, "step": 9400 }, { - "epoch": 67.0489633877371, - "grad_norm": 4.6429548263549805, + "epoch": 134.9378969654199, + "grad_norm": 1.221337080001831, "learning_rate": 8.444816053511706e-06, - "loss": 4.9753, + "loss": 3.9999, "step": 9500 }, { - "epoch": 67.0489633877371, - "eval_loss": 4.809008598327637, - "eval_runtime": 192.8929, - "eval_samples_per_second": 51.842, - "eval_steps_per_second": 6.48, + "epoch": 134.9378969654199, + "eval_loss": 3.848421812057495, + "eval_runtime": 75.1472, + "eval_samples_per_second": 133.072, + "eval_steps_per_second": 8.317, "step": 9500 }, { - "epoch": 67.75474194971328, - "grad_norm": 4.23100471496582, + "epoch": 136.34932956951306, + "grad_norm": 1.2611275911331177, "learning_rate": 8.42809364548495e-06, - "loss": 4.9687, + "loss": 3.9922, "step": 9600 }, { - "epoch": 67.75474194971328, - "eval_loss": 4.804023265838623, - "eval_runtime": 192.1986, - "eval_samples_per_second": 52.03, - "eval_steps_per_second": 6.504, + "epoch": 136.34932956951306, + "eval_loss": 3.8415913581848145, + "eval_runtime": 75.2162, + "eval_samples_per_second": 132.95, + "eval_steps_per_second": 8.309, "step": 9600 }, { - "epoch": 68.46052051168945, - "grad_norm": 3.7249045372009277, + "epoch": 137.76076217360622, + "grad_norm": 1.0138766765594482, "learning_rate": 8.411371237458195e-06, - "loss": 4.9612, + "loss": 3.9831, "step": 9700 }, { - "epoch": 68.46052051168945, - "eval_loss": 4.817007541656494, - "eval_runtime": 192.9147, - "eval_samples_per_second": 51.836, - "eval_steps_per_second": 6.48, + "epoch": 137.76076217360622, + "eval_loss": 3.8360984325408936, + "eval_runtime": 75.2818, + "eval_samples_per_second": 132.834, + "eval_steps_per_second": 8.302, "step": 9700 }, { - "epoch": 69.16629907366564, - "grad_norm": 4.318994045257568, + "epoch": 139.17219477769936, + "grad_norm": 1.4916014671325684, "learning_rate": 8.394648829431439e-06, - "loss": 4.9536, + "loss": 3.9754, "step": 9800 }, { - "epoch": 69.16629907366564, - "eval_loss": 4.804600715637207, - "eval_runtime": 192.8555, - "eval_samples_per_second": 51.852, - "eval_steps_per_second": 6.482, + "epoch": 139.17219477769936, + "eval_loss": 3.8284976482391357, + "eval_runtime": 75.0346, + "eval_samples_per_second": 133.272, + "eval_steps_per_second": 8.329, "step": 9800 }, { - "epoch": 69.87207763564182, - "grad_norm": 3.2238450050354004, + "epoch": 140.58362738179252, + "grad_norm": 1.2558103799819946, "learning_rate": 8.377926421404683e-06, - "loss": 4.9453, + "loss": 3.9674, "step": 9900 }, { - "epoch": 69.87207763564182, - "eval_loss": 4.803778648376465, - "eval_runtime": 192.938, - "eval_samples_per_second": 51.83, - "eval_steps_per_second": 6.479, + "epoch": 140.58362738179252, + "eval_loss": 3.8220624923706055, + "eval_runtime": 75.3076, + "eval_samples_per_second": 132.789, + "eval_steps_per_second": 8.299, "step": 9900 }, { - "epoch": 70.577856197618, - "grad_norm": 2.2831103801727295, + "epoch": 141.99505998588566, + "grad_norm": 1.076314926147461, "learning_rate": 8.361204013377926e-06, - "loss": 4.941, + "loss": 3.9601, "step": 10000 }, { - "epoch": 70.577856197618, - "eval_loss": 4.7956366539001465, - "eval_runtime": 193.0276, - "eval_samples_per_second": 51.806, - "eval_steps_per_second": 6.476, + "epoch": 141.99505998588566, + "eval_loss": 3.814333915710449, + "eval_runtime": 75.2689, + "eval_samples_per_second": 132.857, + "eval_steps_per_second": 8.304, "step": 10000 }, { - "epoch": 71.28363475959418, - "grad_norm": 3.363302707672119, + "epoch": 143.40649258997882, + "grad_norm": 1.3485060930252075, "learning_rate": 8.344481605351172e-06, - "loss": 4.9354, + "loss": 3.9518, "step": 10100 }, { - "epoch": 71.28363475959418, - "eval_loss": 4.778770923614502, - "eval_runtime": 193.1191, - "eval_samples_per_second": 51.782, - "eval_steps_per_second": 6.473, + "epoch": 143.40649258997882, + "eval_loss": 3.8039023876190186, + "eval_runtime": 75.2506, + "eval_samples_per_second": 132.889, + "eval_steps_per_second": 8.306, "step": 10100 }, { - "epoch": 71.98941332157035, - "grad_norm": 3.9507765769958496, + "epoch": 144.817925194072, + "grad_norm": 1.477800726890564, "learning_rate": 8.327759197324416e-06, - "loss": 4.9266, + "loss": 3.9453, "step": 10200 }, { - "epoch": 71.98941332157035, - "eval_loss": 4.776257038116455, - "eval_runtime": 193.1076, - "eval_samples_per_second": 51.785, - "eval_steps_per_second": 6.473, + "epoch": 144.817925194072, + "eval_loss": 3.8053476810455322, + "eval_runtime": 75.7225, + "eval_samples_per_second": 132.061, + "eval_steps_per_second": 8.254, "step": 10200 }, { - "epoch": 72.69519188354654, - "grad_norm": 2.7980282306671143, + "epoch": 146.22935779816513, + "grad_norm": 0.9907758235931396, "learning_rate": 8.31103678929766e-06, - "loss": 4.9215, + "loss": 3.9382, "step": 10300 }, { - "epoch": 72.69519188354654, - "eval_loss": 4.771121025085449, - "eval_runtime": 193.0467, - "eval_samples_per_second": 51.801, - "eval_steps_per_second": 6.475, + "epoch": 146.22935779816513, + "eval_loss": 3.7936344146728516, + "eval_runtime": 75.3213, + "eval_samples_per_second": 132.765, + "eval_steps_per_second": 8.298, "step": 10300 }, { - "epoch": 73.40097044552272, - "grad_norm": 4.593254089355469, + "epoch": 147.6407904022583, + "grad_norm": 0.9641264081001282, "learning_rate": 8.294314381270903e-06, - "loss": 4.9115, + "loss": 3.93, "step": 10400 }, { - "epoch": 73.40097044552272, - "eval_loss": 4.752311706542969, - "eval_runtime": 192.8041, - "eval_samples_per_second": 51.866, - "eval_steps_per_second": 6.483, + "epoch": 147.6407904022583, + "eval_loss": 3.787029266357422, + "eval_runtime": 75.7774, + "eval_samples_per_second": 131.965, + "eval_steps_per_second": 8.248, "step": 10400 }, { - "epoch": 74.10674900749889, - "grad_norm": 5.291107177734375, + "epoch": 149.05222300635145, + "grad_norm": 0.9813300371170044, "learning_rate": 8.277591973244149e-06, - "loss": 4.9132, + "loss": 3.9242, "step": 10500 }, { - "epoch": 74.10674900749889, - "eval_loss": 4.77565860748291, - "eval_runtime": 192.9688, - "eval_samples_per_second": 51.822, - "eval_steps_per_second": 6.478, + "epoch": 149.05222300635145, + "eval_loss": 3.785655975341797, + "eval_runtime": 75.7954, + "eval_samples_per_second": 131.934, + "eval_steps_per_second": 8.246, "step": 10500 }, { - "epoch": 74.81252756947508, - "grad_norm": 4.680324554443359, + "epoch": 150.4636556104446, + "grad_norm": 1.3373197317123413, "learning_rate": 8.260869565217392e-06, - "loss": 4.9081, + "loss": 3.9171, "step": 10600 }, { - "epoch": 74.81252756947508, - "eval_loss": 4.771040916442871, - "eval_runtime": 192.7101, - "eval_samples_per_second": 51.891, - "eval_steps_per_second": 6.486, + "epoch": 150.4636556104446, + "eval_loss": 3.7779102325439453, + "eval_runtime": 75.7673, + "eval_samples_per_second": 131.983, + "eval_steps_per_second": 8.249, "step": 10600 }, { - "epoch": 75.51830613145125, - "grad_norm": 3.9646756649017334, + "epoch": 151.87508821453775, + "grad_norm": 1.0385922193527222, "learning_rate": 8.244147157190636e-06, - "loss": 4.9007, + "loss": 3.91, "step": 10700 }, { - "epoch": 75.51830613145125, - "eval_loss": 4.759604454040527, - "eval_runtime": 192.6574, - "eval_samples_per_second": 51.906, - "eval_steps_per_second": 6.488, + "epoch": 151.87508821453775, + "eval_loss": 3.7687366008758545, + "eval_runtime": 75.7457, + "eval_samples_per_second": 132.021, + "eval_steps_per_second": 8.251, "step": 10700 }, { - "epoch": 76.22408469342744, - "grad_norm": 3.668570041656494, + "epoch": 153.28652081863092, + "grad_norm": 1.0471336841583252, "learning_rate": 8.22742474916388e-06, - "loss": 4.896, + "loss": 3.9039, "step": 10800 }, { - "epoch": 76.22408469342744, - "eval_loss": 4.750930309295654, - "eval_runtime": 192.8954, - "eval_samples_per_second": 51.842, - "eval_steps_per_second": 6.48, + "epoch": 153.28652081863092, + "eval_loss": 3.7694108486175537, + "eval_runtime": 75.7692, + "eval_samples_per_second": 131.98, + "eval_steps_per_second": 8.249, "step": 10800 }, { - "epoch": 76.92986325540362, - "grad_norm": 3.887561082839966, + "epoch": 154.69795342272405, + "grad_norm": 1.0338389873504639, "learning_rate": 8.210702341137125e-06, - "loss": 4.8899, + "loss": 3.8976, "step": 10900 }, { - "epoch": 76.92986325540362, - "eval_loss": 4.750051498413086, - "eval_runtime": 192.85, - "eval_samples_per_second": 51.854, - "eval_steps_per_second": 6.482, + "epoch": 154.69795342272405, + "eval_loss": 3.762704610824585, + "eval_runtime": 75.3453, + "eval_samples_per_second": 132.722, + "eval_steps_per_second": 8.295, "step": 10900 }, { - "epoch": 77.63564181737979, - "grad_norm": 2.5720551013946533, + "epoch": 156.10938602681722, + "grad_norm": 0.9690730571746826, "learning_rate": 8.193979933110369e-06, - "loss": 4.8833, + "loss": 3.8914, "step": 11000 }, { - "epoch": 77.63564181737979, - "eval_loss": 4.728267192840576, - "eval_runtime": 192.8798, - "eval_samples_per_second": 51.846, - "eval_steps_per_second": 6.481, + "epoch": 156.10938602681722, + "eval_loss": 3.7522528171539307, + "eval_runtime": 75.7714, + "eval_samples_per_second": 131.976, + "eval_steps_per_second": 8.248, "step": 11000 }, { - "epoch": 78.34142037935598, - "grad_norm": 3.3731729984283447, + "epoch": 157.52081863091038, + "grad_norm": 1.2407863140106201, "learning_rate": 8.177257525083613e-06, - "loss": 4.878, + "loss": 3.8845, "step": 11100 }, { - "epoch": 78.34142037935598, - "eval_loss": 4.728886604309082, - "eval_runtime": 193.0271, - "eval_samples_per_second": 51.806, - "eval_steps_per_second": 6.476, + "epoch": 157.52081863091038, + "eval_loss": 3.752902030944824, + "eval_runtime": 75.7459, + "eval_samples_per_second": 132.02, + "eval_steps_per_second": 8.251, "step": 11100 }, { - "epoch": 79.04719894133216, - "grad_norm": 3.4648077487945557, + "epoch": 158.93225123500352, + "grad_norm": 1.0596587657928467, "learning_rate": 8.160535117056857e-06, - "loss": 4.8726, + "loss": 3.8795, "step": 11200 }, { - "epoch": 79.04719894133216, - "eval_loss": 4.735613822937012, - "eval_runtime": 193.0564, - "eval_samples_per_second": 51.798, - "eval_steps_per_second": 6.475, + "epoch": 158.93225123500352, + "eval_loss": 3.7448883056640625, + "eval_runtime": 75.3391, + "eval_samples_per_second": 132.733, + "eval_steps_per_second": 8.296, "step": 11200 }, { - "epoch": 79.75297750330834, - "grad_norm": 3.5216147899627686, + "epoch": 160.34368383909668, + "grad_norm": 1.1886705160140991, "learning_rate": 8.143812709030102e-06, - "loss": 4.8677, + "loss": 3.872, "step": 11300 }, { - "epoch": 79.75297750330834, - "eval_loss": 4.705838203430176, - "eval_runtime": 192.8873, - "eval_samples_per_second": 51.844, - "eval_steps_per_second": 6.48, + "epoch": 160.34368383909668, + "eval_loss": 3.740713357925415, + "eval_runtime": 75.3057, + "eval_samples_per_second": 132.792, + "eval_steps_per_second": 8.3, "step": 11300 }, { - "epoch": 80.45875606528452, - "grad_norm": 3.0523054599761963, + "epoch": 161.75511644318985, + "grad_norm": 1.0268244743347168, "learning_rate": 8.127090301003346e-06, - "loss": 4.8623, + "loss": 3.8648, "step": 11400 }, { - "epoch": 80.45875606528452, - "eval_loss": 4.70980978012085, - "eval_runtime": 192.9937, - "eval_samples_per_second": 51.815, - "eval_steps_per_second": 6.477, + "epoch": 161.75511644318985, + "eval_loss": 3.732024669647217, + "eval_runtime": 75.3507, + "eval_samples_per_second": 132.713, + "eval_steps_per_second": 8.295, "step": 11400 }, { - "epoch": 81.1645346272607, - "grad_norm": 5.240682125091553, + "epoch": 163.16654904728298, + "grad_norm": 1.1183993816375732, "learning_rate": 8.11036789297659e-06, - "loss": 4.8558, + "loss": 3.8592, "step": 11500 }, { - "epoch": 81.1645346272607, - "eval_loss": 4.7231903076171875, - "eval_runtime": 192.9049, - "eval_samples_per_second": 51.839, - "eval_steps_per_second": 6.48, + "epoch": 163.16654904728298, + "eval_loss": 3.727663993835449, + "eval_runtime": 75.0877, + "eval_samples_per_second": 133.178, + "eval_steps_per_second": 8.324, "step": 11500 }, { - "epoch": 81.87031318923688, - "grad_norm": 5.212253570556641, + "epoch": 164.57798165137615, + "grad_norm": 0.9553079605102539, "learning_rate": 8.093645484949833e-06, - "loss": 4.8556, + "loss": 3.8529, "step": 11600 }, { - "epoch": 81.87031318923688, - "eval_loss": 4.718225955963135, - "eval_runtime": 192.885, - "eval_samples_per_second": 51.844, - "eval_steps_per_second": 6.481, + "epoch": 164.57798165137615, + "eval_loss": 3.7202141284942627, + "eval_runtime": 75.341, + "eval_samples_per_second": 132.73, + "eval_steps_per_second": 8.296, "step": 11600 }, { - "epoch": 82.57609175121306, - "grad_norm": 4.824629783630371, + "epoch": 165.9894142554693, + "grad_norm": 0.9820226430892944, "learning_rate": 8.076923076923077e-06, - "loss": 4.8512, + "loss": 3.8473, "step": 11700 }, { - "epoch": 82.57609175121306, - "eval_loss": 4.711867809295654, - "eval_runtime": 192.778, - "eval_samples_per_second": 51.873, - "eval_steps_per_second": 6.484, + "epoch": 165.9894142554693, + "eval_loss": 3.7166054248809814, + "eval_runtime": 75.2501, + "eval_samples_per_second": 132.89, + "eval_steps_per_second": 8.306, "step": 11700 }, { - "epoch": 83.28187031318923, - "grad_norm": 3.154628038406372, + "epoch": 167.40084685956245, + "grad_norm": 0.9525455236434937, "learning_rate": 8.060200668896322e-06, - "loss": 4.8433, + "loss": 3.8402, "step": 11800 }, { - "epoch": 83.28187031318923, - "eval_loss": 4.686385154724121, - "eval_runtime": 192.9018, - "eval_samples_per_second": 51.84, - "eval_steps_per_second": 6.48, + "epoch": 167.40084685956245, + "eval_loss": 3.7105445861816406, + "eval_runtime": 75.2571, + "eval_samples_per_second": 132.878, + "eval_steps_per_second": 8.305, "step": 11800 }, { - "epoch": 83.98764887516542, - "grad_norm": 4.0113911628723145, + "epoch": 168.8122794636556, + "grad_norm": 1.0878891944885254, "learning_rate": 8.043478260869566e-06, - "loss": 4.8382, + "loss": 3.8338, "step": 11900 }, { - "epoch": 83.98764887516542, - "eval_loss": 4.685367107391357, - "eval_runtime": 192.9419, - "eval_samples_per_second": 51.829, - "eval_steps_per_second": 6.479, + "epoch": 168.8122794636556, + "eval_loss": 3.70639967918396, + "eval_runtime": 75.2616, + "eval_samples_per_second": 132.87, + "eval_steps_per_second": 8.304, "step": 11900 }, { - "epoch": 84.6934274371416, - "grad_norm": 4.566628456115723, + "epoch": 170.22371206774878, + "grad_norm": 0.8236098289489746, "learning_rate": 8.02675585284281e-06, - "loss": 4.8345, + "loss": 3.8274, "step": 12000 }, { - "epoch": 84.6934274371416, - "eval_loss": 4.6770782470703125, - "eval_runtime": 192.9046, - "eval_samples_per_second": 51.839, - "eval_steps_per_second": 6.48, + "epoch": 170.22371206774878, + "eval_loss": 3.6996874809265137, + "eval_runtime": 75.0887, + "eval_samples_per_second": 133.176, + "eval_steps_per_second": 8.323, "step": 12000 }, { - "epoch": 85.39920599911778, - "grad_norm": 3.314836025238037, + "epoch": 171.6351446718419, + "grad_norm": 0.997268557548523, "learning_rate": 8.010033444816054e-06, - "loss": 4.8254, + "loss": 3.8213, "step": 12100 }, { - "epoch": 85.39920599911778, - "eval_loss": 4.668924331665039, - "eval_runtime": 192.8425, - "eval_samples_per_second": 51.856, - "eval_steps_per_second": 6.482, + "epoch": 171.6351446718419, + "eval_loss": 3.6938281059265137, + "eval_runtime": 75.2392, + "eval_samples_per_second": 132.909, + "eval_steps_per_second": 8.307, "step": 12100 }, { - "epoch": 86.10498456109396, - "grad_norm": 2.746588706970215, + "epoch": 173.04657727593508, + "grad_norm": 0.7722117900848389, "learning_rate": 7.9933110367893e-06, - "loss": 4.8203, + "loss": 3.8171, "step": 12200 }, { - "epoch": 86.10498456109396, - "eval_loss": 4.6668782234191895, - "eval_runtime": 192.8653, - "eval_samples_per_second": 51.85, - "eval_steps_per_second": 6.481, + "epoch": 173.04657727593508, + "eval_loss": 3.6874475479125977, + "eval_runtime": 75.2829, + "eval_samples_per_second": 132.832, + "eval_steps_per_second": 8.302, "step": 12200 }, { - "epoch": 86.81076312307013, - "grad_norm": 3.50884747505188, + "epoch": 174.45800988002824, + "grad_norm": 0.7989856004714966, "learning_rate": 7.976588628762543e-06, - "loss": 4.8144, + "loss": 3.8094, "step": 12300 }, { - "epoch": 86.81076312307013, - "eval_loss": 4.659768581390381, - "eval_runtime": 192.7649, - "eval_samples_per_second": 51.877, - "eval_steps_per_second": 6.485, + "epoch": 174.45800988002824, + "eval_loss": 3.681959390640259, + "eval_runtime": 75.3827, + "eval_samples_per_second": 132.657, + "eval_steps_per_second": 8.291, "step": 12300 }, { - "epoch": 87.51654168504632, - "grad_norm": 3.403320550918579, + "epoch": 175.86944248412138, + "grad_norm": 0.8630412817001343, "learning_rate": 7.959866220735787e-06, - "loss": 4.8102, + "loss": 3.8039, "step": 12400 }, { - "epoch": 87.51654168504632, - "eval_loss": 4.6468186378479, - "eval_runtime": 192.6623, - "eval_samples_per_second": 51.904, - "eval_steps_per_second": 6.488, + "epoch": 175.86944248412138, + "eval_loss": 3.6759207248687744, + "eval_runtime": 75.0135, + "eval_samples_per_second": 133.309, + "eval_steps_per_second": 8.332, "step": 12400 }, { - "epoch": 88.2223202470225, - "grad_norm": 3.2265071868896484, + "epoch": 177.28087508821454, + "grad_norm": 0.7929290533065796, "learning_rate": 7.94314381270903e-06, - "loss": 4.805, + "loss": 3.7975, "step": 12500 }, { - "epoch": 88.2223202470225, - "eval_loss": 4.641306400299072, - "eval_runtime": 193.0472, - "eval_samples_per_second": 51.801, - "eval_steps_per_second": 6.475, + "epoch": 177.28087508821454, + "eval_loss": 3.6663968563079834, + "eval_runtime": 75.3138, + "eval_samples_per_second": 132.778, + "eval_steps_per_second": 8.299, "step": 12500 }, { - "epoch": 88.92809880899868, - "grad_norm": 1.8542296886444092, + "epoch": 178.69230769230768, + "grad_norm": 0.9302893280982971, "learning_rate": 7.926421404682276e-06, - "loss": 4.7997, + "loss": 3.7915, "step": 12600 }, { - "epoch": 88.92809880899868, - "eval_loss": 4.643083572387695, - "eval_runtime": 193.1144, - "eval_samples_per_second": 51.783, - "eval_steps_per_second": 6.473, + "epoch": 178.69230769230768, + "eval_loss": 3.665828227996826, + "eval_runtime": 75.2973, + "eval_samples_per_second": 132.807, + "eval_steps_per_second": 8.3, "step": 12600 }, { - "epoch": 89.63387737097486, - "grad_norm": 5.688361644744873, + "epoch": 180.10374029640084, + "grad_norm": 0.9526273608207703, "learning_rate": 7.90969899665552e-06, - "loss": 4.7915, + "loss": 3.7845, "step": 12700 }, { - "epoch": 89.63387737097486, - "eval_loss": 4.648108005523682, - "eval_runtime": 193.0918, - "eval_samples_per_second": 51.789, - "eval_steps_per_second": 6.474, + "epoch": 180.10374029640084, + "eval_loss": 3.6617684364318848, + "eval_runtime": 75.2728, + "eval_samples_per_second": 132.85, + "eval_steps_per_second": 8.303, "step": 12700 }, { - "epoch": 90.33965593295103, - "grad_norm": 4.801341533660889, + "epoch": 181.515172900494, + "grad_norm": 0.99673992395401, "learning_rate": 7.892976588628763e-06, - "loss": 4.7923, + "loss": 3.7792, "step": 12800 }, { - "epoch": 90.33965593295103, - "eval_loss": 4.623334884643555, - "eval_runtime": 196.2377, - "eval_samples_per_second": 50.959, - "eval_steps_per_second": 6.37, + "epoch": 181.515172900494, + "eval_loss": 3.6547155380249023, + "eval_runtime": 75.0256, + "eval_samples_per_second": 133.288, + "eval_steps_per_second": 8.33, "step": 12800 }, { - "epoch": 91.04543449492722, - "grad_norm": 4.9018402099609375, + "epoch": 182.92660550458714, + "grad_norm": 0.9285475015640259, "learning_rate": 7.876254180602007e-06, - "loss": 4.7882, + "loss": 3.7708, "step": 12900 }, { - "epoch": 91.04543449492722, - "eval_loss": 4.61915922164917, - "eval_runtime": 192.843, - "eval_samples_per_second": 51.856, - "eval_steps_per_second": 6.482, + "epoch": 182.92660550458714, + "eval_loss": 3.649796485900879, + "eval_runtime": 75.3331, + "eval_samples_per_second": 132.744, + "eval_steps_per_second": 8.296, "step": 12900 }, { - "epoch": 91.7512130569034, - "grad_norm": 4.708747863769531, + "epoch": 184.3380381086803, + "grad_norm": 1.0837434530258179, "learning_rate": 7.859531772575253e-06, - "loss": 4.784, + "loss": 3.7654, "step": 13000 }, { - "epoch": 91.7512130569034, - "eval_loss": 4.616652965545654, - "eval_runtime": 191.8729, - "eval_samples_per_second": 52.118, - "eval_steps_per_second": 6.515, + "epoch": 184.3380381086803, + "eval_loss": 3.645569086074829, + "eval_runtime": 75.2864, + "eval_samples_per_second": 132.826, + "eval_steps_per_second": 8.302, "step": 13000 }, { - "epoch": 92.45699161887957, - "grad_norm": 4.2313032150268555, + "epoch": 185.74947071277347, + "grad_norm": 0.8819906711578369, "learning_rate": 7.842809364548496e-06, - "loss": 4.7797, + "loss": 3.7596, "step": 13100 }, { - "epoch": 92.45699161887957, - "eval_loss": 4.622072696685791, - "eval_runtime": 192.9897, - "eval_samples_per_second": 51.816, - "eval_steps_per_second": 6.477, + "epoch": 185.74947071277347, + "eval_loss": 3.637125015258789, + "eval_runtime": 75.2984, + "eval_samples_per_second": 132.805, + "eval_steps_per_second": 8.3, "step": 13100 }, { - "epoch": 93.16277018085576, - "grad_norm": 3.857269763946533, + "epoch": 187.1609033168666, + "grad_norm": 0.8664088249206543, "learning_rate": 7.82608695652174e-06, - "loss": 4.7737, + "loss": 3.7528, "step": 13200 }, { - "epoch": 93.16277018085576, - "eval_loss": 4.623579502105713, - "eval_runtime": 193.0706, - "eval_samples_per_second": 51.795, - "eval_steps_per_second": 6.474, + "epoch": 187.1609033168666, + "eval_loss": 3.6319968700408936, + "eval_runtime": 75.3308, + "eval_samples_per_second": 132.748, + "eval_steps_per_second": 8.297, "step": 13200 }, { - "epoch": 93.86854874283193, - "grad_norm": 3.544672727584839, + "epoch": 188.57233592095977, + "grad_norm": 1.0038634538650513, "learning_rate": 7.809364548494984e-06, - "loss": 4.7665, + "loss": 3.7462, "step": 13300 }, { - "epoch": 93.86854874283193, - "eval_loss": 4.604065418243408, - "eval_runtime": 192.9925, - "eval_samples_per_second": 51.815, - "eval_steps_per_second": 6.477, + "epoch": 188.57233592095977, + "eval_loss": 3.6271727085113525, + "eval_runtime": 75.185, + "eval_samples_per_second": 133.005, + "eval_steps_per_second": 8.313, "step": 13300 }, { - "epoch": 94.57432730480812, - "grad_norm": 3.2030069828033447, + "epoch": 189.98376852505294, + "grad_norm": 0.8182855248451233, "learning_rate": 7.792642140468228e-06, - "loss": 4.7633, + "loss": 3.7395, "step": 13400 }, { - "epoch": 94.57432730480812, - "eval_loss": 4.604846954345703, - "eval_runtime": 192.8851, - "eval_samples_per_second": 51.844, - "eval_steps_per_second": 6.481, + "epoch": 189.98376852505294, + "eval_loss": 3.621074676513672, + "eval_runtime": 75.3792, + "eval_samples_per_second": 132.663, + "eval_steps_per_second": 8.291, "step": 13400 }, { - "epoch": 95.2801058667843, - "grad_norm": 3.9439663887023926, + "epoch": 191.39520112914607, + "grad_norm": 0.7293921113014221, "learning_rate": 7.775919732441473e-06, - "loss": 4.7577, + "loss": 3.7333, "step": 13500 }, { - "epoch": 95.2801058667843, - "eval_loss": 4.624764919281006, - "eval_runtime": 192.852, - "eval_samples_per_second": 51.853, - "eval_steps_per_second": 6.482, + "epoch": 191.39520112914607, + "eval_loss": 3.61885929107666, + "eval_runtime": 75.3514, + "eval_samples_per_second": 132.712, + "eval_steps_per_second": 8.294, "step": 13500 }, { - "epoch": 95.98588442876047, - "grad_norm": 4.150108337402344, + "epoch": 192.80663373323924, + "grad_norm": 0.7797924876213074, "learning_rate": 7.759197324414717e-06, - "loss": 4.7548, + "loss": 3.7263, "step": 13600 }, { - "epoch": 95.98588442876047, - "eval_loss": 4.613603115081787, - "eval_runtime": 192.8547, - "eval_samples_per_second": 51.853, - "eval_steps_per_second": 6.482, + "epoch": 192.80663373323924, + "eval_loss": 3.612473726272583, + "eval_runtime": 75.3512, + "eval_samples_per_second": 132.712, + "eval_steps_per_second": 8.294, "step": 13600 }, { - "epoch": 96.69166299073666, - "grad_norm": 4.102692127227783, + "epoch": 194.2180663373324, + "grad_norm": 0.8082018494606018, "learning_rate": 7.74247491638796e-06, - "loss": 4.7516, + "loss": 3.7209, "step": 13700 }, { - "epoch": 96.69166299073666, - "eval_loss": 4.6045427322387695, - "eval_runtime": 192.9146, - "eval_samples_per_second": 51.836, - "eval_steps_per_second": 6.48, + "epoch": 194.2180663373324, + "eval_loss": 3.6049203872680664, + "eval_runtime": 75.3686, + "eval_samples_per_second": 132.681, + "eval_steps_per_second": 8.293, "step": 13700 }, { - "epoch": 97.39744155271283, - "grad_norm": 3.7328782081604004, + "epoch": 195.62949894142554, + "grad_norm": 0.7965067625045776, "learning_rate": 7.725752508361204e-06, - "loss": 4.7459, + "loss": 3.7147, "step": 13800 }, { - "epoch": 97.39744155271283, - "eval_loss": 4.60223388671875, - "eval_runtime": 192.8585, - "eval_samples_per_second": 51.851, - "eval_steps_per_second": 6.481, + "epoch": 195.62949894142554, + "eval_loss": 3.599919080734253, + "eval_runtime": 75.1572, + "eval_samples_per_second": 133.055, + "eval_steps_per_second": 8.316, "step": 13800 }, { - "epoch": 98.10322011468902, - "grad_norm": 3.9376449584960938, + "epoch": 197.0409315455187, + "grad_norm": 0.8104972839355469, "learning_rate": 7.70903010033445e-06, - "loss": 4.7443, + "loss": 3.7082, "step": 13900 }, { - "epoch": 98.10322011468902, - "eval_loss": 4.591341495513916, - "eval_runtime": 191.4837, - "eval_samples_per_second": 52.224, - "eval_steps_per_second": 6.528, + "epoch": 197.0409315455187, + "eval_loss": 3.5933010578155518, + "eval_runtime": 75.3918, + "eval_samples_per_second": 132.641, + "eval_steps_per_second": 8.29, "step": 13900 }, { - "epoch": 98.8089986766652, - "grad_norm": 4.19343376159668, + "epoch": 198.45236414961187, + "grad_norm": 0.9350934624671936, "learning_rate": 7.692307692307694e-06, - "loss": 4.7368, + "loss": 3.7011, "step": 14000 }, { - "epoch": 98.8089986766652, - "eval_loss": 4.589985370635986, - "eval_runtime": 193.0875, - "eval_samples_per_second": 51.79, - "eval_steps_per_second": 6.474, + "epoch": 198.45236414961187, + "eval_loss": 3.586028814315796, + "eval_runtime": 75.495, + "eval_samples_per_second": 132.459, + "eval_steps_per_second": 8.279, "step": 14000 }, { - "epoch": 99.51477723864137, - "grad_norm": 3.7229936122894287, + "epoch": 199.863796753705, + "grad_norm": 0.7946003079414368, "learning_rate": 7.675585284280937e-06, - "loss": 4.7331, + "loss": 3.6949, "step": 14100 }, { - "epoch": 99.51477723864137, - "eval_loss": 4.577477931976318, - "eval_runtime": 192.9965, - "eval_samples_per_second": 51.814, - "eval_steps_per_second": 6.477, + "epoch": 199.863796753705, + "eval_loss": 3.581510543823242, + "eval_runtime": 75.3493, + "eval_samples_per_second": 132.715, + "eval_steps_per_second": 8.295, "step": 14100 }, { - "epoch": 100.22055580061756, - "grad_norm": 2.3142991065979004, + "epoch": 201.27522935779817, + "grad_norm": 0.776996910572052, "learning_rate": 7.658862876254181e-06, - "loss": 4.7267, + "loss": 3.689, "step": 14200 }, { - "epoch": 100.22055580061756, - "eval_loss": 4.569549083709717, - "eval_runtime": 192.8919, - "eval_samples_per_second": 51.842, - "eval_steps_per_second": 6.48, + "epoch": 201.27522935779817, + "eval_loss": 3.5773942470550537, + "eval_runtime": 75.3953, + "eval_samples_per_second": 132.634, + "eval_steps_per_second": 8.29, "step": 14200 }, { - "epoch": 100.92633436259374, - "grad_norm": 5.407183647155762, + "epoch": 202.68666196189133, + "grad_norm": 0.6654204726219177, "learning_rate": 7.642140468227427e-06, - "loss": 4.7244, + "loss": 3.6824, "step": 14300 }, { - "epoch": 100.92633436259374, - "eval_loss": 4.56317663192749, - "eval_runtime": 191.8718, - "eval_samples_per_second": 52.118, - "eval_steps_per_second": 6.515, + "epoch": 202.68666196189133, + "eval_loss": 3.574646472930908, + "eval_runtime": 75.4068, + "eval_samples_per_second": 132.614, + "eval_steps_per_second": 8.288, "step": 14300 }, { - "epoch": 101.63211292456991, - "grad_norm": 4.50723934173584, + "epoch": 204.09809456598447, + "grad_norm": 0.7471653819084167, "learning_rate": 7.62541806020067e-06, - "loss": 4.7217, + "loss": 3.6764, "step": 14400 }, { - "epoch": 101.63211292456991, - "eval_loss": 4.5554351806640625, - "eval_runtime": 192.9116, - "eval_samples_per_second": 51.837, - "eval_steps_per_second": 6.48, + "epoch": 204.09809456598447, + "eval_loss": 3.5689644813537598, + "eval_runtime": 75.2186, + "eval_samples_per_second": 132.946, + "eval_steps_per_second": 8.309, "step": 14400 }, { - "epoch": 102.3378914865461, - "grad_norm": 5.067579746246338, + "epoch": 205.50952717007763, + "grad_norm": 0.6326889991760254, "learning_rate": 7.608695652173914e-06, - "loss": 4.7166, + "loss": 3.6708, "step": 14500 }, { - "epoch": 102.3378914865461, - "eval_loss": 4.554645538330078, - "eval_runtime": 192.9203, - "eval_samples_per_second": 51.835, - "eval_steps_per_second": 6.479, + "epoch": 205.50952717007763, + "eval_loss": 3.5631089210510254, + "eval_runtime": 75.269, + "eval_samples_per_second": 132.857, + "eval_steps_per_second": 8.304, "step": 14500 }, { - "epoch": 103.04367004852227, - "grad_norm": 4.08118200302124, + "epoch": 206.9209597741708, + "grad_norm": 0.6415218710899353, "learning_rate": 7.591973244147159e-06, - "loss": 4.7128, + "loss": 3.666, "step": 14600 }, { - "epoch": 103.04367004852227, - "eval_loss": 4.5624799728393555, - "eval_runtime": 192.9179, - "eval_samples_per_second": 51.836, - "eval_steps_per_second": 6.479, + "epoch": 206.9209597741708, + "eval_loss": 3.559755563735962, + "eval_runtime": 75.341, + "eval_samples_per_second": 132.73, + "eval_steps_per_second": 8.296, "step": 14600 }, { - "epoch": 103.74944861049846, - "grad_norm": 3.2750284671783447, + "epoch": 208.33239237826393, + "grad_norm": 0.6695938110351562, "learning_rate": 7.5752508361204024e-06, - "loss": 4.7063, + "loss": 3.66, "step": 14700 }, { - "epoch": 103.74944861049846, - "eval_loss": 4.5471391677856445, - "eval_runtime": 199.6361, - "eval_samples_per_second": 50.091, - "eval_steps_per_second": 6.261, + "epoch": 208.33239237826393, + "eval_loss": 3.5543429851531982, + "eval_runtime": 75.3608, + "eval_samples_per_second": 132.695, + "eval_steps_per_second": 8.293, "step": 14700 }, { - "epoch": 104.45522717247464, - "grad_norm": 3.3068602085113525, + "epoch": 209.7438249823571, + "grad_norm": 0.8358725309371948, "learning_rate": 7.558528428093647e-06, - "loss": 4.7037, + "loss": 3.6542, "step": 14800 }, { - "epoch": 104.45522717247464, - "eval_loss": 4.545187473297119, - "eval_runtime": 192.8578, - "eval_samples_per_second": 51.852, - "eval_steps_per_second": 6.481, + "epoch": 209.7438249823571, + "eval_loss": 3.5488317012786865, + "eval_runtime": 75.3691, + "eval_samples_per_second": 132.68, + "eval_steps_per_second": 8.293, "step": 14800 }, { - "epoch": 105.16100573445081, - "grad_norm": 3.0094478130340576, + "epoch": 211.15525758645023, + "grad_norm": 0.87603360414505, "learning_rate": 7.541806020066891e-06, - "loss": 4.6981, + "loss": 3.649, "step": 14900 }, { - "epoch": 105.16100573445081, - "eval_loss": 4.53700065612793, - "eval_runtime": 192.9906, - "eval_samples_per_second": 51.816, - "eval_steps_per_second": 6.477, + "epoch": 211.15525758645023, + "eval_loss": 3.545093297958374, + "eval_runtime": 75.0836, + "eval_samples_per_second": 133.185, + "eval_steps_per_second": 8.324, "step": 14900 }, { - "epoch": 105.866784296427, - "grad_norm": 3.983729839324951, + "epoch": 212.5666901905434, + "grad_norm": 0.7373155951499939, "learning_rate": 7.5250836120401346e-06, - "loss": 4.6944, + "loss": 3.6429, "step": 15000 }, { - "epoch": 105.866784296427, - "eval_loss": 4.532439708709717, - "eval_runtime": 192.8361, - "eval_samples_per_second": 51.858, - "eval_steps_per_second": 6.482, + "epoch": 212.5666901905434, + "eval_loss": 3.5412116050720215, + "eval_runtime": 75.3512, + "eval_samples_per_second": 132.712, + "eval_steps_per_second": 8.294, "step": 15000 }, { - "epoch": 106.57256285840317, - "grad_norm": 2.918281078338623, + "epoch": 213.97812279463656, + "grad_norm": 0.6794809699058533, "learning_rate": 7.508361204013379e-06, - "loss": 4.6889, + "loss": 3.6389, "step": 15100 }, { - "epoch": 106.57256285840317, - "eval_loss": 4.531869411468506, - "eval_runtime": 192.7064, - "eval_samples_per_second": 51.892, - "eval_steps_per_second": 6.487, + "epoch": 213.97812279463656, + "eval_loss": 3.53684139251709, + "eval_runtime": 75.1526, + "eval_samples_per_second": 133.063, + "eval_steps_per_second": 8.316, "step": 15100 }, { - "epoch": 107.27834142037936, - "grad_norm": 4.346628665924072, + "epoch": 215.3895553987297, + "grad_norm": 0.7504755854606628, "learning_rate": 7.491638795986622e-06, - "loss": 4.6891, + "loss": 3.6338, "step": 15200 }, { - "epoch": 107.27834142037936, - "eval_loss": 4.541425704956055, - "eval_runtime": 192.8625, - "eval_samples_per_second": 51.85, - "eval_steps_per_second": 6.481, + "epoch": 215.3895553987297, + "eval_loss": 3.530754804611206, + "eval_runtime": 75.3259, + "eval_samples_per_second": 132.757, + "eval_steps_per_second": 8.297, "step": 15200 }, { - "epoch": 107.98411998235554, - "grad_norm": 4.056755065917969, + "epoch": 216.80098800282286, + "grad_norm": 0.6434842348098755, "learning_rate": 7.474916387959867e-06, - "loss": 4.6849, + "loss": 3.6291, "step": 15300 }, { - "epoch": 107.98411998235554, - "eval_loss": 4.537045478820801, - "eval_runtime": 192.7834, - "eval_samples_per_second": 51.872, - "eval_steps_per_second": 6.484, + "epoch": 216.80098800282286, + "eval_loss": 3.5268187522888184, + "eval_runtime": 75.4572, + "eval_samples_per_second": 132.525, + "eval_steps_per_second": 8.283, "step": 15300 }, { - "epoch": 108.68989854433171, - "grad_norm": 3.7352023124694824, + "epoch": 218.21242060691603, + "grad_norm": 0.7057756781578064, "learning_rate": 7.4581939799331104e-06, - "loss": 4.6794, + "loss": 3.6233, "step": 15400 }, { - "epoch": 108.68989854433171, - "eval_loss": 4.529075622558594, - "eval_runtime": 192.8094, - "eval_samples_per_second": 51.865, - "eval_steps_per_second": 6.483, + "epoch": 218.21242060691603, + "eval_loss": 3.5211195945739746, + "eval_runtime": 75.4018, + "eval_samples_per_second": 132.623, + "eval_steps_per_second": 8.289, "step": 15400 }, { - "epoch": 109.3956771063079, - "grad_norm": 3.330331563949585, + "epoch": 219.62385321100916, + "grad_norm": 0.6788062453269958, "learning_rate": 7.441471571906354e-06, - "loss": 4.6755, + "loss": 3.6187, "step": 15500 }, { - "epoch": 109.3956771063079, - "eval_loss": 4.511175155639648, - "eval_runtime": 192.7628, - "eval_samples_per_second": 51.877, - "eval_steps_per_second": 6.485, + "epoch": 219.62385321100916, + "eval_loss": 3.5227015018463135, + "eval_runtime": 75.1928, + "eval_samples_per_second": 132.992, + "eval_steps_per_second": 8.312, "step": 15500 }, { - "epoch": 110.10145566828407, - "grad_norm": 3.188854455947876, + "epoch": 221.03528581510233, + "grad_norm": 0.5607024431228638, "learning_rate": 7.424749163879599e-06, - "loss": 4.6707, + "loss": 3.6145, "step": 15600 }, { - "epoch": 110.10145566828407, - "eval_loss": 4.5077056884765625, - "eval_runtime": 192.935, - "eval_samples_per_second": 51.831, - "eval_steps_per_second": 6.479, + "epoch": 221.03528581510233, + "eval_loss": 3.514875888824463, + "eval_runtime": 75.3921, + "eval_samples_per_second": 132.64, + "eval_steps_per_second": 8.29, "step": 15600 }, { - "epoch": 110.80723423026025, - "grad_norm": 3.105900764465332, + "epoch": 222.4467184191955, + "grad_norm": 1.0785976648330688, "learning_rate": 7.4080267558528426e-06, - "loss": 4.6668, + "loss": 3.6091, "step": 15700 }, { - "epoch": 110.80723423026025, - "eval_loss": 4.517475605010986, - "eval_runtime": 192.991, - "eval_samples_per_second": 51.816, - "eval_steps_per_second": 6.477, + "epoch": 222.4467184191955, + "eval_loss": 3.5113186836242676, + "eval_runtime": 75.365, + "eval_samples_per_second": 132.688, + "eval_steps_per_second": 8.293, "step": 15700 }, { - "epoch": 111.51301279223644, - "grad_norm": 4.242959499359131, + "epoch": 223.85815102328863, + "grad_norm": 0.8430230617523193, "learning_rate": 7.391304347826087e-06, - "loss": 4.6635, + "loss": 3.6059, "step": 15800 }, { - "epoch": 111.51301279223644, - "eval_loss": 4.49766731262207, - "eval_runtime": 193.0181, - "eval_samples_per_second": 51.809, - "eval_steps_per_second": 6.476, + "epoch": 223.85815102328863, + "eval_loss": 3.5113461017608643, + "eval_runtime": 75.3749, + "eval_samples_per_second": 132.67, + "eval_steps_per_second": 8.292, "step": 15800 }, { - "epoch": 112.21879135421261, - "grad_norm": 4.33656120300293, + "epoch": 225.2695836273818, + "grad_norm": 0.5923272967338562, "learning_rate": 7.374581939799331e-06, - "loss": 4.6616, + "loss": 3.6016, "step": 15900 }, { - "epoch": 112.21879135421261, - "eval_loss": 4.495267391204834, - "eval_runtime": 192.7551, - "eval_samples_per_second": 51.879, - "eval_steps_per_second": 6.485, + "epoch": 225.2695836273818, + "eval_loss": 3.5035176277160645, + "eval_runtime": 75.2962, + "eval_samples_per_second": 132.809, + "eval_steps_per_second": 8.301, "step": 15900 }, { - "epoch": 112.9245699161888, - "grad_norm": 3.9022886753082275, + "epoch": 226.68101623147496, + "grad_norm": 0.7486432194709778, "learning_rate": 7.3578595317725755e-06, - "loss": 4.6576, + "loss": 3.598, "step": 16000 }, { - "epoch": 112.9245699161888, - "eval_loss": 4.488037109375, - "eval_runtime": 192.6469, - "eval_samples_per_second": 51.908, - "eval_steps_per_second": 6.489, + "epoch": 226.68101623147496, + "eval_loss": 3.4993245601654053, + "eval_runtime": 75.1842, + "eval_samples_per_second": 133.007, + "eval_steps_per_second": 8.313, "step": 16000 }, { - "epoch": 113.63034847816498, - "grad_norm": 4.036642551422119, + "epoch": 228.0924488355681, + "grad_norm": 0.8732613325119019, "learning_rate": 7.341137123745819e-06, - "loss": 4.6531, + "loss": 3.5925, "step": 16100 }, { - "epoch": 113.63034847816498, - "eval_loss": 4.481316566467285, - "eval_runtime": 192.8177, - "eval_samples_per_second": 51.862, - "eval_steps_per_second": 6.483, + "epoch": 228.0924488355681, + "eval_loss": 3.495875835418701, + "eval_runtime": 75.4084, + "eval_samples_per_second": 132.611, + "eval_steps_per_second": 8.288, "step": 16100 }, { - "epoch": 114.33612704014115, - "grad_norm": 2.9444971084594727, + "epoch": 229.50388143966126, + "grad_norm": 0.6386623978614807, "learning_rate": 7.324414715719064e-06, - "loss": 4.6515, + "loss": 3.5889, "step": 16200 }, { - "epoch": 114.33612704014115, - "eval_loss": 4.487556457519531, - "eval_runtime": 192.9071, - "eval_samples_per_second": 51.838, - "eval_steps_per_second": 6.48, + "epoch": 229.50388143966126, + "eval_loss": 3.492525815963745, + "eval_runtime": 75.3941, + "eval_samples_per_second": 132.636, + "eval_steps_per_second": 8.29, "step": 16200 }, { - "epoch": 115.04190560211734, - "grad_norm": 3.479867458343506, + "epoch": 230.91531404375442, + "grad_norm": 0.7436226606369019, "learning_rate": 7.307692307692308e-06, - "loss": 4.6443, + "loss": 3.5848, "step": 16300 }, { - "epoch": 115.04190560211734, - "eval_loss": 4.487501621246338, - "eval_runtime": 192.8449, - "eval_samples_per_second": 51.855, - "eval_steps_per_second": 6.482, + "epoch": 230.91531404375442, + "eval_loss": 3.4906632900238037, + "eval_runtime": 75.3655, + "eval_samples_per_second": 132.687, + "eval_steps_per_second": 8.293, "step": 16300 }, { - "epoch": 115.74768416409351, - "grad_norm": 3.0684421062469482, + "epoch": 232.32674664784756, + "grad_norm": 0.8418622016906738, "learning_rate": 7.290969899665552e-06, - "loss": 4.6404, + "loss": 3.5803, "step": 16400 }, { - "epoch": 115.74768416409351, - "eval_loss": 4.476925849914551, - "eval_runtime": 192.7395, - "eval_samples_per_second": 51.883, - "eval_steps_per_second": 6.485, + "epoch": 232.32674664784756, + "eval_loss": 3.4869649410247803, + "eval_runtime": 75.3226, + "eval_samples_per_second": 132.762, + "eval_steps_per_second": 8.298, "step": 16400 }, { - "epoch": 116.4534627260697, - "grad_norm": 2.6923413276672363, + "epoch": 233.73817925194072, + "grad_norm": 0.7934303879737854, "learning_rate": 7.274247491638796e-06, - "loss": 4.6367, + "loss": 3.5778, "step": 16500 }, { - "epoch": 116.4534627260697, - "eval_loss": 4.472157955169678, - "eval_runtime": 192.8153, - "eval_samples_per_second": 51.863, - "eval_steps_per_second": 6.483, + "epoch": 233.73817925194072, + "eval_loss": 3.478532314300537, + "eval_runtime": 75.0535, + "eval_samples_per_second": 133.238, + "eval_steps_per_second": 8.327, "step": 16500 }, { - "epoch": 117.15924128804588, - "grad_norm": 2.108584403991699, + "epoch": 235.14961185603389, + "grad_norm": 0.6809811592102051, "learning_rate": 7.257525083612041e-06, - "loss": 4.6313, + "loss": 3.5736, "step": 16600 }, { - "epoch": 117.15924128804588, - "eval_loss": 4.469875812530518, - "eval_runtime": 192.7752, - "eval_samples_per_second": 51.874, - "eval_steps_per_second": 6.484, + "epoch": 235.14961185603389, + "eval_loss": 3.4779250621795654, + "eval_runtime": 75.3809, + "eval_samples_per_second": 132.66, + "eval_steps_per_second": 8.291, "step": 16600 }, { - "epoch": 117.86501985002205, - "grad_norm": 2.614610433578491, + "epoch": 236.56104446012702, + "grad_norm": 0.8483596444129944, "learning_rate": 7.240802675585284e-06, - "loss": 4.6285, + "loss": 3.5682, "step": 16700 }, { - "epoch": 117.86501985002205, - "eval_loss": 4.4715576171875, - "eval_runtime": 192.9832, - "eval_samples_per_second": 51.818, - "eval_steps_per_second": 6.477, + "epoch": 236.56104446012702, + "eval_loss": 3.473116874694824, + "eval_runtime": 75.3936, + "eval_samples_per_second": 132.637, + "eval_steps_per_second": 8.29, "step": 16700 }, { - "epoch": 118.57079841199824, - "grad_norm": 3.742584705352783, + "epoch": 237.9724770642202, + "grad_norm": 0.6600437164306641, "learning_rate": 7.224080267558529e-06, - "loss": 4.6251, + "loss": 3.5656, "step": 16800 }, { - "epoch": 118.57079841199824, - "eval_loss": 4.470496654510498, - "eval_runtime": 192.6888, - "eval_samples_per_second": 51.897, - "eval_steps_per_second": 6.487, + "epoch": 237.9724770642202, + "eval_loss": 3.47174072265625, + "eval_runtime": 75.3681, + "eval_samples_per_second": 132.682, + "eval_steps_per_second": 8.293, "step": 16800 }, { - "epoch": 119.27657697397441, - "grad_norm": 3.862029790878296, + "epoch": 239.38390966831335, + "grad_norm": 0.607857882976532, "learning_rate": 7.207357859531773e-06, - "loss": 4.6208, + "loss": 3.5616, "step": 16900 }, { - "epoch": 119.27657697397441, - "eval_loss": 4.456589221954346, - "eval_runtime": 192.6316, - "eval_samples_per_second": 51.913, - "eval_steps_per_second": 6.489, + "epoch": 239.38390966831335, + "eval_loss": 3.467322587966919, + "eval_runtime": 75.3846, + "eval_samples_per_second": 132.653, + "eval_steps_per_second": 8.291, "step": 16900 }, { - "epoch": 119.98235553595059, - "grad_norm": 4.002711772918701, + "epoch": 240.7953422724065, + "grad_norm": 0.6345656514167786, "learning_rate": 7.190635451505017e-06, - "loss": 4.6192, + "loss": 3.5582, "step": 17000 }, { - "epoch": 119.98235553595059, - "eval_loss": 4.452151775360107, - "eval_runtime": 192.6019, - "eval_samples_per_second": 51.921, - "eval_steps_per_second": 6.49, + "epoch": 240.7953422724065, + "eval_loss": 3.4662554264068604, + "eval_runtime": 75.3233, + "eval_samples_per_second": 132.761, + "eval_steps_per_second": 8.298, "step": 17000 }, { - "epoch": 120.68813409792678, - "grad_norm": 3.7718334197998047, + "epoch": 242.20677487649965, + "grad_norm": 0.6691942811012268, "learning_rate": 7.173913043478261e-06, - "loss": 4.6138, + "loss": 3.5552, "step": 17100 }, { - "epoch": 120.68813409792678, - "eval_loss": 4.4539289474487305, - "eval_runtime": 192.6066, - "eval_samples_per_second": 51.919, - "eval_steps_per_second": 6.49, + "epoch": 242.20677487649965, + "eval_loss": 3.4594500064849854, + "eval_runtime": 75.1343, + "eval_samples_per_second": 133.095, + "eval_steps_per_second": 8.318, "step": 17100 }, { - "epoch": 121.39391265990295, - "grad_norm": 2.563103199005127, + "epoch": 243.61820748059282, + "grad_norm": 0.6638765931129456, "learning_rate": 7.157190635451506e-06, - "loss": 4.6111, + "loss": 3.551, "step": 17200 }, { - "epoch": 121.39391265990295, - "eval_loss": 4.452628135681152, - "eval_runtime": 192.8153, - "eval_samples_per_second": 51.863, - "eval_steps_per_second": 6.483, + "epoch": 243.61820748059282, + "eval_loss": 3.459233283996582, + "eval_runtime": 75.3664, + "eval_samples_per_second": 132.685, + "eval_steps_per_second": 8.293, "step": 17200 }, { - "epoch": 122.09969122187914, - "grad_norm": 3.4357354640960693, + "epoch": 245.02964008468595, + "grad_norm": 0.7361800670623779, "learning_rate": 7.1404682274247495e-06, - "loss": 4.607, + "loss": 3.5476, "step": 17300 }, { - "epoch": 122.09969122187914, - "eval_loss": 4.443899154663086, - "eval_runtime": 192.7526, - "eval_samples_per_second": 51.88, - "eval_steps_per_second": 6.485, + "epoch": 245.02964008468595, + "eval_loss": 3.456892251968384, + "eval_runtime": 75.3908, + "eval_samples_per_second": 132.642, + "eval_steps_per_second": 8.29, "step": 17300 }, { - "epoch": 122.80546978385532, - "grad_norm": 2.8962490558624268, + "epoch": 246.44107268877912, + "grad_norm": 0.7599518299102783, "learning_rate": 7.123745819397993e-06, - "loss": 4.6034, + "loss": 3.544, "step": 17400 }, { - "epoch": 122.80546978385532, - "eval_loss": 4.439050197601318, - "eval_runtime": 192.63, - "eval_samples_per_second": 51.913, - "eval_steps_per_second": 6.489, + "epoch": 246.44107268877912, + "eval_loss": 3.453518867492676, + "eval_runtime": 75.3827, + "eval_samples_per_second": 132.656, + "eval_steps_per_second": 8.291, "step": 17400 }, { - "epoch": 123.51124834583149, - "grad_norm": 3.929617166519165, + "epoch": 247.85250529287225, + "grad_norm": 0.6123988032341003, "learning_rate": 7.107023411371238e-06, - "loss": 4.6003, + "loss": 3.5408, "step": 17500 }, { - "epoch": 123.51124834583149, - "eval_loss": 4.444461345672607, - "eval_runtime": 191.7463, - "eval_samples_per_second": 52.152, - "eval_steps_per_second": 6.519, + "epoch": 247.85250529287225, + "eval_loss": 3.4502522945404053, + "eval_runtime": 75.3683, + "eval_samples_per_second": 132.682, + "eval_steps_per_second": 8.293, "step": 17500 }, { - "epoch": 124.21702690780768, - "grad_norm": 3.257054328918457, + "epoch": 249.26393789696542, + "grad_norm": 0.6615849137306213, "learning_rate": 7.0903010033444816e-06, - "loss": 4.5958, + "loss": 3.5376, "step": 17600 }, { - "epoch": 124.21702690780768, - "eval_loss": 4.434076309204102, - "eval_runtime": 192.5214, - "eval_samples_per_second": 51.942, - "eval_steps_per_second": 6.493, + "epoch": 249.26393789696542, + "eval_loss": 3.446369171142578, + "eval_runtime": 75.2088, + "eval_samples_per_second": 132.963, + "eval_steps_per_second": 8.31, "step": 17600 }, { - "epoch": 124.92280546978385, - "grad_norm": 3.739959955215454, + "epoch": 250.67537050105858, + "grad_norm": 0.6400436162948608, "learning_rate": 7.073578595317726e-06, - "loss": 4.5943, + "loss": 3.5338, "step": 17700 }, { - "epoch": 124.92280546978385, - "eval_loss": 4.446750640869141, - "eval_runtime": 192.937, - "eval_samples_per_second": 51.83, - "eval_steps_per_second": 6.479, + "epoch": 250.67537050105858, + "eval_loss": 3.444706678390503, + "eval_runtime": 75.2466, + "eval_samples_per_second": 132.896, + "eval_steps_per_second": 8.306, "step": 17700 }, { - "epoch": 125.62858403176004, - "grad_norm": 4.178181171417236, + "epoch": 252.08680310515172, + "grad_norm": 0.6528608798980713, "learning_rate": 7.05685618729097e-06, - "loss": 4.5918, + "loss": 3.5307, "step": 17800 }, { - "epoch": 125.62858403176004, - "eval_loss": 4.437300682067871, - "eval_runtime": 192.8353, - "eval_samples_per_second": 51.858, - "eval_steps_per_second": 6.482, + "epoch": 252.08680310515172, + "eval_loss": 3.4401872158050537, + "eval_runtime": 75.4051, + "eval_samples_per_second": 132.617, + "eval_steps_per_second": 8.289, "step": 17800 }, { - "epoch": 126.33436259373622, - "grad_norm": 3.977018356323242, + "epoch": 253.49823570924488, + "grad_norm": 0.7888281345367432, "learning_rate": 7.0401337792642145e-06, - "loss": 4.5878, + "loss": 3.5276, "step": 17900 }, { - "epoch": 126.33436259373622, - "eval_loss": 4.44089937210083, - "eval_runtime": 193.0731, - "eval_samples_per_second": 51.794, - "eval_steps_per_second": 6.474, + "epoch": 253.49823570924488, + "eval_loss": 3.438217878341675, + "eval_runtime": 76.8212, + "eval_samples_per_second": 130.172, + "eval_steps_per_second": 8.136, "step": 17900 }, { - "epoch": 127.04014115571239, - "grad_norm": 3.078171730041504, + "epoch": 254.90966831333805, + "grad_norm": 0.6938881874084473, "learning_rate": 7.023411371237458e-06, - "loss": 4.5852, + "loss": 3.5237, "step": 18000 }, { - "epoch": 127.04014115571239, - "eval_loss": 4.439931869506836, - "eval_runtime": 192.9618, - "eval_samples_per_second": 51.824, - "eval_steps_per_second": 6.478, + "epoch": 254.90966831333805, + "eval_loss": 3.434736728668213, + "eval_runtime": 76.8401, + "eval_samples_per_second": 130.14, + "eval_steps_per_second": 8.134, "step": 18000 }, { - "epoch": 127.74591971768858, - "grad_norm": 3.410306453704834, + "epoch": 256.3211009174312, + "grad_norm": 0.6238895654678345, "learning_rate": 7.006688963210703e-06, - "loss": 4.5818, + "loss": 3.5208, "step": 18100 }, { - "epoch": 127.74591971768858, - "eval_loss": 4.428420543670654, - "eval_runtime": 192.0087, - "eval_samples_per_second": 52.081, - "eval_steps_per_second": 6.51, + "epoch": 256.3211009174312, + "eval_loss": 3.4336953163146973, + "eval_runtime": 76.7196, + "eval_samples_per_second": 130.345, + "eval_steps_per_second": 8.147, "step": 18100 }, { - "epoch": 128.45169827966475, - "grad_norm": 3.11771821975708, + "epoch": 257.7325335215244, + "grad_norm": 0.5130559206008911, "learning_rate": 6.989966555183947e-06, - "loss": 4.5779, + "loss": 3.5184, "step": 18200 }, { - "epoch": 128.45169827966475, - "eval_loss": 4.426939964294434, - "eval_runtime": 192.6134, - "eval_samples_per_second": 51.917, - "eval_steps_per_second": 6.49, + "epoch": 257.7325335215244, + "eval_loss": 3.4297399520874023, + "eval_runtime": 76.8567, + "eval_samples_per_second": 130.112, + "eval_steps_per_second": 8.132, "step": 18200 }, { - "epoch": 129.15747684164094, - "grad_norm": 3.2677371501922607, + "epoch": 259.1439661256175, + "grad_norm": 0.480792373418808, "learning_rate": 6.973244147157191e-06, - "loss": 4.5726, + "loss": 3.5156, "step": 18300 }, { - "epoch": 129.15747684164094, - "eval_loss": 4.415538311004639, - "eval_runtime": 192.607, - "eval_samples_per_second": 51.919, - "eval_steps_per_second": 6.49, + "epoch": 259.1439661256175, + "eval_loss": 3.4281508922576904, + "eval_runtime": 76.9652, + "eval_samples_per_second": 129.929, + "eval_steps_per_second": 8.121, "step": 18300 }, { - "epoch": 129.8632554036171, - "grad_norm": 2.232645034790039, + "epoch": 260.55539872971065, + "grad_norm": 0.6780312657356262, "learning_rate": 6.956521739130435e-06, - "loss": 4.5702, + "loss": 3.5118, "step": 18400 }, { - "epoch": 129.8632554036171, - "eval_loss": 4.415960788726807, - "eval_runtime": 192.7593, - "eval_samples_per_second": 51.878, - "eval_steps_per_second": 6.485, + "epoch": 260.55539872971065, + "eval_loss": 3.4258944988250732, + "eval_runtime": 76.8943, + "eval_samples_per_second": 130.049, + "eval_steps_per_second": 8.128, "step": 18400 }, { - "epoch": 130.5690339655933, - "grad_norm": 3.8693525791168213, + "epoch": 261.96683133380384, + "grad_norm": 0.5994666814804077, "learning_rate": 6.93979933110368e-06, - "loss": 4.5658, + "loss": 3.5086, "step": 18500 }, { - "epoch": 130.5690339655933, - "eval_loss": 4.400354862213135, - "eval_runtime": 192.8222, - "eval_samples_per_second": 51.861, - "eval_steps_per_second": 6.483, + "epoch": 261.96683133380384, + "eval_loss": 3.422513246536255, + "eval_runtime": 76.8723, + "eval_samples_per_second": 130.086, + "eval_steps_per_second": 8.13, "step": 18500 }, { - "epoch": 131.27481252756948, - "grad_norm": 3.49723744392395, + "epoch": 263.378263937897, + "grad_norm": 0.7816299796104431, "learning_rate": 6.923076923076923e-06, - "loss": 4.5651, + "loss": 3.505, "step": 18600 }, { - "epoch": 131.27481252756948, - "eval_loss": 4.400233745574951, - "eval_runtime": 192.8045, - "eval_samples_per_second": 51.866, - "eval_steps_per_second": 6.483, + "epoch": 263.378263937897, + "eval_loss": 3.4186975955963135, + "eval_runtime": 76.7391, + "eval_samples_per_second": 130.312, + "eval_steps_per_second": 8.144, "step": 18600 }, { - "epoch": 131.98059108954567, - "grad_norm": 2.549008369445801, + "epoch": 264.7896965419901, + "grad_norm": 0.6295963525772095, "learning_rate": 6.906354515050168e-06, - "loss": 4.5591, + "loss": 3.5025, "step": 18700 }, { - "epoch": 131.98059108954567, - "eval_loss": 4.412561893463135, - "eval_runtime": 192.859, - "eval_samples_per_second": 51.851, - "eval_steps_per_second": 6.481, + "epoch": 264.7896965419901, + "eval_loss": 3.416973352432251, + "eval_runtime": 76.7644, + "eval_samples_per_second": 130.269, + "eval_steps_per_second": 8.142, "step": 18700 }, { - "epoch": 132.68636965152183, - "grad_norm": 3.369398355484009, + "epoch": 266.20112914608325, + "grad_norm": 0.5306549072265625, "learning_rate": 6.889632107023412e-06, - "loss": 4.5559, + "loss": 3.5002, "step": 18800 }, { - "epoch": 132.68636965152183, - "eval_loss": 4.398661136627197, - "eval_runtime": 191.3185, - "eval_samples_per_second": 52.269, - "eval_steps_per_second": 6.534, + "epoch": 266.20112914608325, + "eval_loss": 3.4140942096710205, + "eval_runtime": 76.9199, + "eval_samples_per_second": 130.005, + "eval_steps_per_second": 8.125, "step": 18800 }, { - "epoch": 133.39214821349802, - "grad_norm": 4.078741550445557, + "epoch": 267.61256175017644, + "grad_norm": 0.6763441562652588, "learning_rate": 6.872909698996656e-06, - "loss": 4.552, + "loss": 3.4971, "step": 18900 }, { - "epoch": 133.39214821349802, - "eval_loss": 4.395702838897705, - "eval_runtime": 193.0075, - "eval_samples_per_second": 51.811, - "eval_steps_per_second": 6.476, + "epoch": 267.61256175017644, + "eval_loss": 3.4103851318359375, + "eval_runtime": 76.776, + "eval_samples_per_second": 130.249, + "eval_steps_per_second": 8.141, "step": 18900 }, { - "epoch": 134.0979267754742, - "grad_norm": 3.5382561683654785, + "epoch": 269.0239943542696, + "grad_norm": 0.557765781879425, "learning_rate": 6.8561872909699e-06, - "loss": 4.5518, + "loss": 3.4944, "step": 19000 }, { - "epoch": 134.0979267754742, - "eval_loss": 4.388858795166016, - "eval_runtime": 192.7771, - "eval_samples_per_second": 51.873, - "eval_steps_per_second": 6.484, + "epoch": 269.0239943542696, + "eval_loss": 3.4099249839782715, + "eval_runtime": 76.8515, + "eval_samples_per_second": 130.121, + "eval_steps_per_second": 8.133, "step": 19000 }, { - "epoch": 134.80370533745037, - "grad_norm": 3.3008577823638916, + "epoch": 270.4354269583627, + "grad_norm": 0.6447959542274475, "learning_rate": 6.839464882943144e-06, - "loss": 4.5479, + "loss": 3.492, "step": 19100 }, { - "epoch": 134.80370533745037, - "eval_loss": 4.383645534515381, - "eval_runtime": 192.8234, - "eval_samples_per_second": 51.861, - "eval_steps_per_second": 6.483, + "epoch": 270.4354269583627, + "eval_loss": 3.408655881881714, + "eval_runtime": 76.7546, + "eval_samples_per_second": 130.285, + "eval_steps_per_second": 8.143, "step": 19100 }, { - "epoch": 135.50948389942656, - "grad_norm": 2.746685028076172, + "epoch": 271.8468595624559, + "grad_norm": 0.5875563621520996, "learning_rate": 6.8227424749163885e-06, - "loss": 4.5429, + "loss": 3.4898, "step": 19200 }, { - "epoch": 135.50948389942656, - "eval_loss": 4.391398906707764, - "eval_runtime": 192.9035, - "eval_samples_per_second": 51.839, - "eval_steps_per_second": 6.48, + "epoch": 271.8468595624559, + "eval_loss": 3.404829263687134, + "eval_runtime": 76.8832, + "eval_samples_per_second": 130.067, + "eval_steps_per_second": 8.129, "step": 19200 }, { - "epoch": 136.21526246140274, - "grad_norm": 3.0366287231445312, + "epoch": 273.25829216654904, + "grad_norm": 0.6500788927078247, "learning_rate": 6.806020066889632e-06, - "loss": 4.5418, + "loss": 3.4858, "step": 19300 }, { - "epoch": 136.21526246140274, - "eval_loss": 4.388709545135498, - "eval_runtime": 192.8256, - "eval_samples_per_second": 51.86, - "eval_steps_per_second": 6.483, + "epoch": 273.25829216654904, + "eval_loss": 3.40264892578125, + "eval_runtime": 76.8385, + "eval_samples_per_second": 130.143, + "eval_steps_per_second": 8.134, "step": 19300 }, { - "epoch": 136.9210410233789, - "grad_norm": 2.791844129562378, + "epoch": 274.6697247706422, + "grad_norm": 0.7083373069763184, "learning_rate": 6.789297658862877e-06, - "loss": 4.5374, + "loss": 3.4824, "step": 19400 }, { - "epoch": 136.9210410233789, - "eval_loss": 4.3709540367126465, - "eval_runtime": 192.7779, - "eval_samples_per_second": 51.873, - "eval_steps_per_second": 6.484, + "epoch": 274.6697247706422, + "eval_loss": 3.398090124130249, + "eval_runtime": 76.8694, + "eval_samples_per_second": 130.091, + "eval_steps_per_second": 8.131, "step": 19400 }, { - "epoch": 137.6268195853551, - "grad_norm": 3.523527145385742, + "epoch": 276.08115737473537, + "grad_norm": 0.7219327092170715, "learning_rate": 6.772575250836121e-06, - "loss": 4.5342, + "loss": 3.4811, "step": 19500 }, { - "epoch": 137.6268195853551, - "eval_loss": 4.375741958618164, - "eval_runtime": 192.8076, - "eval_samples_per_second": 51.865, - "eval_steps_per_second": 6.483, + "epoch": 276.08115737473537, + "eval_loss": 3.396939516067505, + "eval_runtime": 76.9317, + "eval_samples_per_second": 129.985, + "eval_steps_per_second": 8.124, "step": 19500 }, { - "epoch": 138.33259814733128, - "grad_norm": 3.4836888313293457, + "epoch": 277.4925899788285, + "grad_norm": 0.49477267265319824, "learning_rate": 6.755852842809365e-06, - "loss": 4.5332, + "loss": 3.4782, "step": 19600 }, { - "epoch": 138.33259814733128, - "eval_loss": 4.368445873260498, - "eval_runtime": 192.8365, - "eval_samples_per_second": 51.857, - "eval_steps_per_second": 6.482, + "epoch": 277.4925899788285, + "eval_loss": 3.395801544189453, + "eval_runtime": 76.7586, + "eval_samples_per_second": 130.279, + "eval_steps_per_second": 8.142, "step": 19600 }, { - "epoch": 139.03837670930744, - "grad_norm": 3.156789541244507, + "epoch": 278.90402258292164, + "grad_norm": 0.7135903835296631, "learning_rate": 6.739130434782609e-06, - "loss": 4.5292, + "loss": 3.4749, "step": 19700 }, { - "epoch": 139.03837670930744, - "eval_loss": 4.36691951751709, - "eval_runtime": 192.79, - "eval_samples_per_second": 51.87, - "eval_steps_per_second": 6.484, + "epoch": 278.90402258292164, + "eval_loss": 3.393001079559326, + "eval_runtime": 76.9157, + "eval_samples_per_second": 130.012, + "eval_steps_per_second": 8.126, "step": 19700 }, { - "epoch": 139.74415527128363, - "grad_norm": 2.8722000122070312, + "epoch": 280.31545518701483, + "grad_norm": 0.7002623677253723, "learning_rate": 6.7224080267558536e-06, - "loss": 4.5268, + "loss": 3.4731, "step": 19800 }, { - "epoch": 139.74415527128363, - "eval_loss": 4.3648576736450195, - "eval_runtime": 192.6108, - "eval_samples_per_second": 51.918, - "eval_steps_per_second": 6.49, + "epoch": 280.31545518701483, + "eval_loss": 3.3903141021728516, + "eval_runtime": 75.5607, + "eval_samples_per_second": 132.344, + "eval_steps_per_second": 8.271, "step": 19800 }, { - "epoch": 140.44993383325982, - "grad_norm": 3.5349180698394775, + "epoch": 281.72688779110797, + "grad_norm": 0.6401045322418213, "learning_rate": 6.705685618729097e-06, - "loss": 4.5219, + "loss": 3.4704, "step": 19900 }, { - "epoch": 140.44993383325982, - "eval_loss": 4.363093852996826, - "eval_runtime": 194.2702, - "eval_samples_per_second": 51.475, - "eval_steps_per_second": 6.434, + "epoch": 281.72688779110797, + "eval_loss": 3.3897156715393066, + "eval_runtime": 76.9187, + "eval_samples_per_second": 130.007, + "eval_steps_per_second": 8.125, "step": 19900 }, { - "epoch": 141.155712395236, - "grad_norm": 3.690683603286743, + "epoch": 283.1383203952011, + "grad_norm": 0.6009179353713989, "learning_rate": 6.688963210702342e-06, - "loss": 4.5195, + "loss": 3.4667, "step": 20000 }, { - "epoch": 141.155712395236, - "eval_loss": 4.358035087585449, - "eval_runtime": 193.0962, - "eval_samples_per_second": 51.788, - "eval_steps_per_second": 6.473, + "epoch": 283.1383203952011, + "eval_loss": 3.388314962387085, + "eval_runtime": 76.761, + "eval_samples_per_second": 130.274, + "eval_steps_per_second": 8.142, "step": 20000 }, { - "epoch": 141.86149095721217, - "grad_norm": 2.9390950202941895, + "epoch": 284.5497529992943, + "grad_norm": 0.6317482590675354, "learning_rate": 6.672240802675586e-06, - "loss": 4.5166, + "loss": 3.4653, "step": 20100 }, { - "epoch": 141.86149095721217, - "eval_loss": 4.358652114868164, - "eval_runtime": 192.7746, - "eval_samples_per_second": 51.874, - "eval_steps_per_second": 6.484, + "epoch": 284.5497529992943, + "eval_loss": 3.382216691970825, + "eval_runtime": 76.6247, + "eval_samples_per_second": 130.506, + "eval_steps_per_second": 8.157, "step": 20100 }, { - "epoch": 142.56726951918836, - "grad_norm": 2.8932175636291504, + "epoch": 285.96118560338743, + "grad_norm": 0.5381720066070557, "learning_rate": 6.65551839464883e-06, - "loss": 4.5114, + "loss": 3.4639, "step": 20200 }, { - "epoch": 142.56726951918836, - "eval_loss": 4.352255344390869, - "eval_runtime": 192.7204, - "eval_samples_per_second": 51.889, - "eval_steps_per_second": 6.486, + "epoch": 285.96118560338743, + "eval_loss": 3.379488229751587, + "eval_runtime": 76.8042, + "eval_samples_per_second": 130.201, + "eval_steps_per_second": 8.138, "step": 20200 }, { - "epoch": 143.27304808116455, - "grad_norm": 2.0399158000946045, + "epoch": 287.37261820748057, + "grad_norm": 0.5575243234634399, "learning_rate": 6.638795986622074e-06, - "loss": 4.5087, + "loss": 3.4614, "step": 20300 }, { - "epoch": 143.27304808116455, - "eval_loss": 4.344502925872803, - "eval_runtime": 191.5654, - "eval_samples_per_second": 52.201, - "eval_steps_per_second": 6.525, + "epoch": 287.37261820748057, + "eval_loss": 3.37882137298584, + "eval_runtime": 76.6731, + "eval_samples_per_second": 130.424, + "eval_steps_per_second": 8.151, "step": 20300 }, { - "epoch": 143.9788266431407, - "grad_norm": 2.7520031929016113, + "epoch": 288.78405081157376, + "grad_norm": 0.6351670622825623, "learning_rate": 6.622073578595319e-06, - "loss": 4.5059, + "loss": 3.4581, "step": 20400 }, { - "epoch": 143.9788266431407, - "eval_loss": 4.3483452796936035, - "eval_runtime": 192.6443, - "eval_samples_per_second": 51.909, - "eval_steps_per_second": 6.489, + "epoch": 288.78405081157376, + "eval_loss": 3.376701593399048, + "eval_runtime": 76.6829, + "eval_samples_per_second": 130.407, + "eval_steps_per_second": 8.15, "step": 20400 }, { - "epoch": 144.6846052051169, - "grad_norm": 3.5588791370391846, + "epoch": 290.1954834156669, + "grad_norm": 0.8166664242744446, "learning_rate": 6.605351170568562e-06, - "loss": 4.5034, + "loss": 3.4559, "step": 20500 }, { - "epoch": 144.6846052051169, - "eval_loss": 4.353188991546631, - "eval_runtime": 192.9208, - "eval_samples_per_second": 51.835, - "eval_steps_per_second": 6.479, + "epoch": 290.1954834156669, + "eval_loss": 3.37479305267334, + "eval_runtime": 76.6609, + "eval_samples_per_second": 130.445, + "eval_steps_per_second": 8.153, "step": 20500 }, { - "epoch": 145.39038376709308, - "grad_norm": 3.6025145053863525, + "epoch": 291.60691601976004, + "grad_norm": 0.6857735514640808, "learning_rate": 6.588628762541807e-06, - "loss": 4.5004, + "loss": 3.452, "step": 20600 }, { - "epoch": 145.39038376709308, - "eval_loss": 4.348476886749268, - "eval_runtime": 192.8971, - "eval_samples_per_second": 51.841, - "eval_steps_per_second": 6.48, + "epoch": 291.60691601976004, + "eval_loss": 3.372075080871582, + "eval_runtime": 76.5027, + "eval_samples_per_second": 130.714, + "eval_steps_per_second": 8.17, "step": 20600 }, { - "epoch": 146.09616232906924, - "grad_norm": 3.118164300918579, + "epoch": 293.0183486238532, + "grad_norm": 0.5841640830039978, "learning_rate": 6.571906354515051e-06, - "loss": 4.4985, + "loss": 3.4506, "step": 20700 }, { - "epoch": 146.09616232906924, - "eval_loss": 4.34409236907959, - "eval_runtime": 192.7938, - "eval_samples_per_second": 51.869, - "eval_steps_per_second": 6.484, + "epoch": 293.0183486238532, + "eval_loss": 3.367901563644409, + "eval_runtime": 75.957, + "eval_samples_per_second": 131.653, + "eval_steps_per_second": 8.228, "step": 20700 }, { - "epoch": 146.80194089104543, - "grad_norm": 3.1238110065460205, + "epoch": 294.42978122794636, + "grad_norm": 0.6171954870223999, "learning_rate": 6.5551839464882945e-06, - "loss": 4.4967, + "loss": 3.4472, "step": 20800 }, { - "epoch": 146.80194089104543, - "eval_loss": 4.336893081665039, - "eval_runtime": 192.9869, - "eval_samples_per_second": 51.817, - "eval_steps_per_second": 6.477, + "epoch": 294.42978122794636, + "eval_loss": 3.3702898025512695, + "eval_runtime": 75.9329, + "eval_samples_per_second": 131.695, + "eval_steps_per_second": 8.231, "step": 20800 }, { - "epoch": 147.50771945302162, - "grad_norm": 2.9554052352905273, + "epoch": 295.8412138320395, + "grad_norm": 0.5663770437240601, "learning_rate": 6.538461538461539e-06, - "loss": 4.4941, + "loss": 3.4471, "step": 20900 }, { - "epoch": 147.50771945302162, - "eval_loss": 4.332675933837891, - "eval_runtime": 193.2794, - "eval_samples_per_second": 51.739, - "eval_steps_per_second": 6.467, + "epoch": 295.8412138320395, + "eval_loss": 3.363970994949341, + "eval_runtime": 75.4098, + "eval_samples_per_second": 132.609, + "eval_steps_per_second": 8.288, "step": 20900 }, { - "epoch": 148.21349801499778, - "grad_norm": 3.8570468425750732, + "epoch": 297.2526464361327, + "grad_norm": 0.5675836801528931, "learning_rate": 6.521739130434783e-06, - "loss": 4.4896, + "loss": 3.4444, "step": 21000 }, { - "epoch": 148.21349801499778, - "eval_loss": 4.33201265335083, - "eval_runtime": 192.1673, - "eval_samples_per_second": 52.038, - "eval_steps_per_second": 6.505, + "epoch": 297.2526464361327, + "eval_loss": 3.366093397140503, + "eval_runtime": 75.0156, + "eval_samples_per_second": 133.306, + "eval_steps_per_second": 8.332, "step": 21000 }, { - "epoch": 148.91927657697397, - "grad_norm": 3.5484137535095215, + "epoch": 298.66407904022583, + "grad_norm": 0.6213370561599731, "learning_rate": 6.5050167224080275e-06, - "loss": 4.4865, + "loss": 3.4411, "step": 21100 }, { - "epoch": 148.91927657697397, - "eval_loss": 4.323690891265869, - "eval_runtime": 192.7014, - "eval_samples_per_second": 51.894, - "eval_steps_per_second": 6.487, + "epoch": 298.66407904022583, + "eval_loss": 3.362675666809082, + "eval_runtime": 75.5557, + "eval_samples_per_second": 132.353, + "eval_steps_per_second": 8.272, "step": 21100 }, { - "epoch": 149.62505513895016, - "grad_norm": 2.477860689163208, + "epoch": 300.07551164431896, + "grad_norm": 0.5837761759757996, "learning_rate": 6.488294314381271e-06, - "loss": 4.4854, + "loss": 3.4401, "step": 21200 }, { - "epoch": 149.62505513895016, - "eval_loss": 4.325220108032227, - "eval_runtime": 192.711, - "eval_samples_per_second": 51.891, - "eval_steps_per_second": 6.486, + "epoch": 300.07551164431896, + "eval_loss": 3.3615338802337646, + "eval_runtime": 76.9026, + "eval_samples_per_second": 130.035, + "eval_steps_per_second": 8.127, "step": 21200 }, { - "epoch": 150.33083370092635, - "grad_norm": 2.7688395977020264, + "epoch": 301.48694424841216, + "grad_norm": 0.512986958026886, "learning_rate": 6.471571906354516e-06, - "loss": 4.4811, + "loss": 3.4376, "step": 21300 }, { - "epoch": 150.33083370092635, - "eval_loss": 4.318790912628174, - "eval_runtime": 191.6687, - "eval_samples_per_second": 52.173, - "eval_steps_per_second": 6.522, + "epoch": 301.48694424841216, + "eval_loss": 3.360222578048706, + "eval_runtime": 77.0669, + "eval_samples_per_second": 129.757, + "eval_steps_per_second": 8.11, "step": 21300 }, { - "epoch": 151.0366122629025, - "grad_norm": 2.488335132598877, + "epoch": 302.8983768525053, + "grad_norm": 0.6404605507850647, "learning_rate": 6.45484949832776e-06, - "loss": 4.4779, + "loss": 3.435, "step": 21400 }, { - "epoch": 151.0366122629025, - "eval_loss": 4.315010070800781, - "eval_runtime": 192.8194, - "eval_samples_per_second": 51.862, - "eval_steps_per_second": 6.483, + "epoch": 302.8983768525053, + "eval_loss": 3.355327844619751, + "eval_runtime": 77.0252, + "eval_samples_per_second": 129.828, + "eval_steps_per_second": 8.114, "step": 21400 }, { - "epoch": 151.7423908248787, - "grad_norm": 4.23807430267334, + "epoch": 304.30980945659843, + "grad_norm": 0.6251723766326904, "learning_rate": 6.438127090301004e-06, - "loss": 4.4748, + "loss": 3.4317, "step": 21500 }, { - "epoch": 151.7423908248787, - "eval_loss": 4.310873508453369, - "eval_runtime": 192.7367, - "eval_samples_per_second": 51.884, - "eval_steps_per_second": 6.486, + "epoch": 304.30980945659843, + "eval_loss": 3.3560776710510254, + "eval_runtime": 76.6868, + "eval_samples_per_second": 130.4, + "eval_steps_per_second": 8.15, "step": 21500 }, { - "epoch": 152.4481693868549, - "grad_norm": 3.4757931232452393, + "epoch": 305.7212420606916, + "grad_norm": 0.6398562788963318, "learning_rate": 6.421404682274248e-06, - "loss": 4.4709, + "loss": 3.4313, "step": 21600 }, { - "epoch": 152.4481693868549, - "eval_loss": 4.311969757080078, - "eval_runtime": 192.7365, - "eval_samples_per_second": 51.884, - "eval_steps_per_second": 6.486, + "epoch": 305.7212420606916, + "eval_loss": 3.3519680500030518, + "eval_runtime": 76.6282, + "eval_samples_per_second": 130.5, + "eval_steps_per_second": 8.156, "step": 21600 }, { - "epoch": 153.15394794883105, - "grad_norm": 2.7201569080352783, + "epoch": 307.13267466478476, + "grad_norm": 0.6046691536903381, "learning_rate": 6.4046822742474926e-06, - "loss": 4.4702, + "loss": 3.4288, "step": 21700 }, { - "epoch": 153.15394794883105, - "eval_loss": 4.309377670288086, - "eval_runtime": 191.683, - "eval_samples_per_second": 52.169, - "eval_steps_per_second": 6.521, + "epoch": 307.13267466478476, + "eval_loss": 3.3501694202423096, + "eval_runtime": 76.6941, + "eval_samples_per_second": 130.388, + "eval_steps_per_second": 8.149, "step": 21700 }, { - "epoch": 153.85972651080723, - "grad_norm": 2.9689505100250244, + "epoch": 308.5441072688779, + "grad_norm": 0.5842333436012268, "learning_rate": 6.387959866220736e-06, - "loss": 4.4676, + "loss": 3.4263, "step": 21800 }, { - "epoch": 153.85972651080723, - "eval_loss": 4.306754112243652, - "eval_runtime": 192.9533, - "eval_samples_per_second": 51.826, - "eval_steps_per_second": 6.478, + "epoch": 308.5441072688779, + "eval_loss": 3.3471367359161377, + "eval_runtime": 76.9426, + "eval_samples_per_second": 129.967, + "eval_steps_per_second": 8.123, "step": 21800 }, { - "epoch": 154.56550507278342, - "grad_norm": 2.387266159057617, + "epoch": 309.9555398729711, + "grad_norm": 0.6153472065925598, "learning_rate": 6.371237458193981e-06, - "loss": 4.4627, + "loss": 3.4239, "step": 21900 }, { - "epoch": 154.56550507278342, - "eval_loss": 4.30245304107666, - "eval_runtime": 193.1693, - "eval_samples_per_second": 51.768, - "eval_steps_per_second": 6.471, + "epoch": 309.9555398729711, + "eval_loss": 3.3457906246185303, + "eval_runtime": 76.7426, + "eval_samples_per_second": 130.306, + "eval_steps_per_second": 8.144, "step": 21900 }, { - "epoch": 155.27128363475958, - "grad_norm": 3.121486186981201, + "epoch": 311.3669724770642, + "grad_norm": 0.5566316246986389, "learning_rate": 6.354515050167225e-06, - "loss": 4.4611, + "loss": 3.4213, "step": 22000 }, { - "epoch": 155.27128363475958, - "eval_loss": 4.305142402648926, - "eval_runtime": 192.9869, - "eval_samples_per_second": 51.817, - "eval_steps_per_second": 6.477, + "epoch": 311.3669724770642, + "eval_loss": 3.343456268310547, + "eval_runtime": 76.5426, + "eval_samples_per_second": 130.646, + "eval_steps_per_second": 8.165, "step": 22000 }, { - "epoch": 155.97706219673577, - "grad_norm": 2.7891833782196045, + "epoch": 312.77840508115736, + "grad_norm": 0.5340844988822937, "learning_rate": 6.337792642140469e-06, - "loss": 4.4584, + "loss": 3.4212, "step": 22100 }, { - "epoch": 155.97706219673577, - "eval_loss": 4.298542499542236, - "eval_runtime": 192.7261, - "eval_samples_per_second": 51.887, - "eval_steps_per_second": 6.486, + "epoch": 312.77840508115736, + "eval_loss": 3.3435897827148438, + "eval_runtime": 76.6598, + "eval_samples_per_second": 130.446, + "eval_steps_per_second": 8.153, "step": 22100 }, { - "epoch": 156.68284075871196, - "grad_norm": 2.5039618015289307, + "epoch": 314.18983768525055, + "grad_norm": 0.5751451849937439, "learning_rate": 6.321070234113713e-06, - "loss": 4.4559, + "loss": 3.4176, "step": 22200 }, { - "epoch": 156.68284075871196, - "eval_loss": 4.297332286834717, - "eval_runtime": 193.002, - "eval_samples_per_second": 51.813, - "eval_steps_per_second": 6.477, + "epoch": 314.18983768525055, + "eval_loss": 3.340423345565796, + "eval_runtime": 76.7241, + "eval_samples_per_second": 130.337, + "eval_steps_per_second": 8.146, "step": 22200 }, { - "epoch": 157.38861932068812, - "grad_norm": 2.1365339756011963, + "epoch": 315.6012702893437, + "grad_norm": 0.6988760828971863, "learning_rate": 6.304347826086958e-06, - "loss": 4.4513, + "loss": 3.4165, "step": 22300 }, { - "epoch": 157.38861932068812, - "eval_loss": 4.295238971710205, - "eval_runtime": 192.8492, - "eval_samples_per_second": 51.854, - "eval_steps_per_second": 6.482, + "epoch": 315.6012702893437, + "eval_loss": 3.3365683555603027, + "eval_runtime": 76.7082, + "eval_samples_per_second": 130.364, + "eval_steps_per_second": 8.148, "step": 22300 }, { - "epoch": 158.0943978826643, - "grad_norm": 3.097100257873535, + "epoch": 317.0127028934368, + "grad_norm": 0.6003105044364929, "learning_rate": 6.287625418060201e-06, - "loss": 4.4497, + "loss": 3.4139, "step": 22400 }, { - "epoch": 158.0943978826643, - "eval_loss": 4.300503253936768, - "eval_runtime": 192.6224, - "eval_samples_per_second": 51.915, - "eval_steps_per_second": 6.489, + "epoch": 317.0127028934368, + "eval_loss": 3.3391008377075195, + "eval_runtime": 76.6667, + "eval_samples_per_second": 130.435, + "eval_steps_per_second": 8.152, "step": 22400 }, { - "epoch": 158.8001764446405, - "grad_norm": 3.8582558631896973, + "epoch": 318.42413549753, + "grad_norm": 0.6000874042510986, "learning_rate": 6.270903010033445e-06, - "loss": 4.4474, + "loss": 3.4116, "step": 22500 }, { - "epoch": 158.8001764446405, - "eval_loss": 4.287741184234619, - "eval_runtime": 192.9249, - "eval_samples_per_second": 51.834, - "eval_steps_per_second": 6.479, + "epoch": 318.42413549753, + "eval_loss": 3.3361566066741943, + "eval_runtime": 76.5668, + "eval_samples_per_second": 130.605, + "eval_steps_per_second": 8.163, "step": 22500 }, { - "epoch": 159.5059550066167, - "grad_norm": 4.082438945770264, + "epoch": 319.83556810162315, + "grad_norm": 0.5550394058227539, "learning_rate": 6.25418060200669e-06, - "loss": 4.4448, + "loss": 3.4107, "step": 22600 }, { - "epoch": 159.5059550066167, - "eval_loss": 4.294191360473633, - "eval_runtime": 192.8498, - "eval_samples_per_second": 51.854, - "eval_steps_per_second": 6.482, + "epoch": 319.83556810162315, + "eval_loss": 3.334117889404297, + "eval_runtime": 76.601, + "eval_samples_per_second": 130.547, + "eval_steps_per_second": 8.159, "step": 22600 }, { - "epoch": 160.21173356859285, - "grad_norm": 3.6704208850860596, + "epoch": 321.2470007057163, + "grad_norm": 0.6073735356330872, "learning_rate": 6.2374581939799335e-06, - "loss": 4.4423, + "loss": 3.4081, "step": 22700 }, { - "epoch": 160.21173356859285, - "eval_loss": 4.287137031555176, - "eval_runtime": 192.9058, - "eval_samples_per_second": 51.839, - "eval_steps_per_second": 6.48, + "epoch": 321.2470007057163, + "eval_loss": 3.331463575363159, + "eval_runtime": 76.8708, + "eval_samples_per_second": 130.088, + "eval_steps_per_second": 8.131, "step": 22700 }, { - "epoch": 160.91751213056904, - "grad_norm": 3.7998480796813965, + "epoch": 322.6584333098095, + "grad_norm": 0.4972577691078186, "learning_rate": 6.220735785953178e-06, - "loss": 4.4398, + "loss": 3.4063, "step": 22800 }, { - "epoch": 160.91751213056904, - "eval_loss": 4.290914535522461, - "eval_runtime": 193.0269, - "eval_samples_per_second": 51.806, - "eval_steps_per_second": 6.476, + "epoch": 322.6584333098095, + "eval_loss": 3.3266327381134033, + "eval_runtime": 76.6986, + "eval_samples_per_second": 130.381, + "eval_steps_per_second": 8.149, "step": 22800 }, { - "epoch": 161.62329069254523, - "grad_norm": 2.859389066696167, + "epoch": 324.0698659139026, + "grad_norm": 0.753437340259552, "learning_rate": 6.204013377926422e-06, - "loss": 4.4382, + "loss": 3.4039, "step": 22900 }, { - "epoch": 161.62329069254523, - "eval_loss": 4.279192924499512, - "eval_runtime": 192.9277, - "eval_samples_per_second": 51.833, - "eval_steps_per_second": 6.479, + "epoch": 324.0698659139026, + "eval_loss": 3.3264803886413574, + "eval_runtime": 76.8541, + "eval_samples_per_second": 130.117, + "eval_steps_per_second": 8.132, "step": 22900 }, { - "epoch": 162.3290692545214, - "grad_norm": 3.8973562717437744, + "epoch": 325.48129851799575, + "grad_norm": 0.6331711411476135, "learning_rate": 6.1872909698996665e-06, - "loss": 4.4363, + "loss": 3.4021, "step": 23000 }, { - "epoch": 162.3290692545214, - "eval_loss": 4.272188186645508, - "eval_runtime": 192.7815, - "eval_samples_per_second": 51.872, - "eval_steps_per_second": 6.484, + "epoch": 325.48129851799575, + "eval_loss": 3.328427314758301, + "eval_runtime": 76.5771, + "eval_samples_per_second": 130.587, + "eval_steps_per_second": 8.162, "step": 23000 + }, + { + "epoch": 326.89273112208895, + "grad_norm": 0.5499814748764038, + "learning_rate": 6.17056856187291e-06, + "loss": 3.4007, + "step": 23100 + }, + { + "epoch": 326.89273112208895, + "eval_loss": 3.325409412384033, + "eval_runtime": 76.8356, + "eval_samples_per_second": 130.148, + "eval_steps_per_second": 8.134, + "step": 23100 + }, + { + "epoch": 328.3041637261821, + "grad_norm": 0.6611850261688232, + "learning_rate": 6.153846153846155e-06, + "loss": 3.3982, + "step": 23200 + }, + { + "epoch": 328.3041637261821, + "eval_loss": 3.325766086578369, + "eval_runtime": 76.7214, + "eval_samples_per_second": 130.342, + "eval_steps_per_second": 8.146, + "step": 23200 + }, + { + "epoch": 329.7155963302752, + "grad_norm": 0.7029439806938171, + "learning_rate": 6.137123745819399e-06, + "loss": 3.3966, + "step": 23300 + }, + { + "epoch": 329.7155963302752, + "eval_loss": 3.3255183696746826, + "eval_runtime": 76.7208, + "eval_samples_per_second": 130.343, + "eval_steps_per_second": 8.146, + "step": 23300 + }, + { + "epoch": 331.1270289343684, + "grad_norm": 0.5664273500442505, + "learning_rate": 6.120401337792643e-06, + "loss": 3.3942, + "step": 23400 + }, + { + "epoch": 331.1270289343684, + "eval_loss": 3.3201589584350586, + "eval_runtime": 76.9218, + "eval_samples_per_second": 130.002, + "eval_steps_per_second": 8.125, + "step": 23400 + }, + { + "epoch": 332.53846153846155, + "grad_norm": 0.49630409479141235, + "learning_rate": 6.103678929765887e-06, + "loss": 3.3915, + "step": 23500 + }, + { + "epoch": 332.53846153846155, + "eval_loss": 3.3179125785827637, + "eval_runtime": 76.7322, + "eval_samples_per_second": 130.323, + "eval_steps_per_second": 8.145, + "step": 23500 + }, + { + "epoch": 333.9498941425547, + "grad_norm": 0.5176893472671509, + "learning_rate": 6.086956521739132e-06, + "loss": 3.3909, + "step": 23600 + }, + { + "epoch": 333.9498941425547, + "eval_loss": 3.316556215286255, + "eval_runtime": 76.9107, + "eval_samples_per_second": 130.021, + "eval_steps_per_second": 8.126, + "step": 23600 + }, + { + "epoch": 335.3613267466478, + "grad_norm": 0.5305048823356628, + "learning_rate": 6.070234113712375e-06, + "loss": 3.3892, + "step": 23700 + }, + { + "epoch": 335.3613267466478, + "eval_loss": 3.314192533493042, + "eval_runtime": 76.9191, + "eval_samples_per_second": 130.007, + "eval_steps_per_second": 8.125, + "step": 23700 + }, + { + "epoch": 336.772759350741, + "grad_norm": 0.5576648116111755, + "learning_rate": 6.05351170568562e-06, + "loss": 3.3875, + "step": 23800 + }, + { + "epoch": 336.772759350741, + "eval_loss": 3.314286708831787, + "eval_runtime": 80.2593, + "eval_samples_per_second": 124.596, + "eval_steps_per_second": 7.787, + "step": 23800 + }, + { + "epoch": 338.18419195483415, + "grad_norm": 0.5328962802886963, + "learning_rate": 6.036789297658864e-06, + "loss": 3.3851, + "step": 23900 + }, + { + "epoch": 338.18419195483415, + "eval_loss": 3.312880516052246, + "eval_runtime": 76.9752, + "eval_samples_per_second": 129.912, + "eval_steps_per_second": 8.119, + "step": 23900 + }, + { + "epoch": 339.5956245589273, + "grad_norm": 0.568818211555481, + "learning_rate": 6.020066889632108e-06, + "loss": 3.3839, + "step": 24000 + }, + { + "epoch": 339.5956245589273, + "eval_loss": 3.3099491596221924, + "eval_runtime": 76.771, + "eval_samples_per_second": 130.258, + "eval_steps_per_second": 8.141, + "step": 24000 + }, + { + "epoch": 341.0070571630205, + "grad_norm": 0.7009373307228088, + "learning_rate": 6.003344481605352e-06, + "loss": 3.3816, + "step": 24100 + }, + { + "epoch": 341.0070571630205, + "eval_loss": 3.3101296424865723, + "eval_runtime": 76.9806, + "eval_samples_per_second": 129.903, + "eval_steps_per_second": 8.119, + "step": 24100 + }, + { + "epoch": 342.4184897671136, + "grad_norm": 0.677617073059082, + "learning_rate": 5.986622073578597e-06, + "loss": 3.3797, + "step": 24200 + }, + { + "epoch": 342.4184897671136, + "eval_loss": 3.3081202507019043, + "eval_runtime": 77.0716, + "eval_samples_per_second": 129.75, + "eval_steps_per_second": 8.109, + "step": 24200 + }, + { + "epoch": 343.82992237120675, + "grad_norm": 0.5626755952835083, + "learning_rate": 5.9698996655518404e-06, + "loss": 3.3783, + "step": 24300 + }, + { + "epoch": 343.82992237120675, + "eval_loss": 3.306441307067871, + "eval_runtime": 77.1155, + "eval_samples_per_second": 129.676, + "eval_steps_per_second": 8.105, + "step": 24300 + }, + { + "epoch": 345.24135497529994, + "grad_norm": 0.5604245662689209, + "learning_rate": 5.953177257525084e-06, + "loss": 3.3767, + "step": 24400 + }, + { + "epoch": 345.24135497529994, + "eval_loss": 3.3060827255249023, + "eval_runtime": 77.0349, + "eval_samples_per_second": 129.811, + "eval_steps_per_second": 8.113, + "step": 24400 + }, + { + "epoch": 346.6527875793931, + "grad_norm": 0.4447239339351654, + "learning_rate": 5.936454849498329e-06, + "loss": 3.3753, + "step": 24500 + }, + { + "epoch": 346.6527875793931, + "eval_loss": 3.299449920654297, + "eval_runtime": 77.0415, + "eval_samples_per_second": 129.8, + "eval_steps_per_second": 8.113, + "step": 24500 + }, + { + "epoch": 348.0642201834862, + "grad_norm": 0.4330262839794159, + "learning_rate": 5.9197324414715726e-06, + "loss": 3.3728, + "step": 24600 + }, + { + "epoch": 348.0642201834862, + "eval_loss": 3.3012359142303467, + "eval_runtime": 76.9455, + "eval_samples_per_second": 129.962, + "eval_steps_per_second": 8.123, + "step": 24600 + }, + { + "epoch": 349.4756527875794, + "grad_norm": 0.5388196110725403, + "learning_rate": 5.903010033444817e-06, + "loss": 3.3712, + "step": 24700 + }, + { + "epoch": 349.4756527875794, + "eval_loss": 3.3015191555023193, + "eval_runtime": 77.0484, + "eval_samples_per_second": 129.789, + "eval_steps_per_second": 8.112, + "step": 24700 + }, + { + "epoch": 350.88708539167254, + "grad_norm": 0.5516813397407532, + "learning_rate": 5.886287625418061e-06, + "loss": 3.3687, + "step": 24800 + }, + { + "epoch": 350.88708539167254, + "eval_loss": 3.299619436264038, + "eval_runtime": 77.2423, + "eval_samples_per_second": 129.463, + "eval_steps_per_second": 8.091, + "step": 24800 + }, + { + "epoch": 352.2985179957657, + "grad_norm": 0.5282019972801208, + "learning_rate": 5.8695652173913055e-06, + "loss": 3.3675, + "step": 24900 + }, + { + "epoch": 352.2985179957657, + "eval_loss": 3.2979164123535156, + "eval_runtime": 77.1133, + "eval_samples_per_second": 129.679, + "eval_steps_per_second": 8.105, + "step": 24900 + }, + { + "epoch": 353.70995059985887, + "grad_norm": 0.5380076169967651, + "learning_rate": 5.852842809364549e-06, + "loss": 3.366, + "step": 25000 + }, + { + "epoch": 353.70995059985887, + "eval_loss": 3.2956883907318115, + "eval_runtime": 77.1138, + "eval_samples_per_second": 129.679, + "eval_steps_per_second": 8.105, + "step": 25000 + }, + { + "epoch": 355.121383203952, + "grad_norm": 0.5667893290519714, + "learning_rate": 5.836120401337794e-06, + "loss": 3.3642, + "step": 25100 + }, + { + "epoch": 355.121383203952, + "eval_loss": 3.292149543762207, + "eval_runtime": 76.8472, + "eval_samples_per_second": 130.128, + "eval_steps_per_second": 8.133, + "step": 25100 + }, + { + "epoch": 356.53281580804514, + "grad_norm": 0.5600453019142151, + "learning_rate": 5.819397993311037e-06, + "loss": 3.363, + "step": 25200 + }, + { + "epoch": 356.53281580804514, + "eval_loss": 3.2910449504852295, + "eval_runtime": 77.1271, + "eval_samples_per_second": 129.656, + "eval_steps_per_second": 8.104, + "step": 25200 + }, + { + "epoch": 357.94424841213834, + "grad_norm": 0.5726943016052246, + "learning_rate": 5.8026755852842806e-06, + "loss": 3.3617, + "step": 25300 + }, + { + "epoch": 357.94424841213834, + "eval_loss": 3.292724609375, + "eval_runtime": 77.1078, + "eval_samples_per_second": 129.689, + "eval_steps_per_second": 8.106, + "step": 25300 + }, + { + "epoch": 359.3556810162315, + "grad_norm": 0.5794702768325806, + "learning_rate": 5.785953177257525e-06, + "loss": 3.3605, + "step": 25400 + }, + { + "epoch": 359.3556810162315, + "eval_loss": 3.2890124320983887, + "eval_runtime": 75.9883, + "eval_samples_per_second": 131.599, + "eval_steps_per_second": 8.225, + "step": 25400 + }, + { + "epoch": 360.7671136203246, + "grad_norm": 0.5813022255897522, + "learning_rate": 5.769230769230769e-06, + "loss": 3.3579, + "step": 25500 + }, + { + "epoch": 360.7671136203246, + "eval_loss": 3.288177251815796, + "eval_runtime": 76.09, + "eval_samples_per_second": 131.423, + "eval_steps_per_second": 8.214, + "step": 25500 + }, + { + "epoch": 362.1785462244178, + "grad_norm": 0.5103623270988464, + "learning_rate": 5.7525083612040135e-06, + "loss": 3.3559, + "step": 25600 + }, + { + "epoch": 362.1785462244178, + "eval_loss": 3.2898659706115723, + "eval_runtime": 75.9717, + "eval_samples_per_second": 131.628, + "eval_steps_per_second": 8.227, + "step": 25600 + }, + { + "epoch": 363.58997882851094, + "grad_norm": 0.5904416441917419, + "learning_rate": 5.735785953177257e-06, + "loss": 3.3546, + "step": 25700 + }, + { + "epoch": 363.58997882851094, + "eval_loss": 3.2834417819976807, + "eval_runtime": 76.1148, + "eval_samples_per_second": 131.38, + "eval_steps_per_second": 8.211, + "step": 25700 + }, + { + "epoch": 365.0014114326041, + "grad_norm": 0.5598199367523193, + "learning_rate": 5.719063545150502e-06, + "loss": 3.3532, + "step": 25800 + }, + { + "epoch": 365.0014114326041, + "eval_loss": 3.2836413383483887, + "eval_runtime": 76.0815, + "eval_samples_per_second": 131.438, + "eval_steps_per_second": 8.215, + "step": 25800 + }, + { + "epoch": 366.41284403669727, + "grad_norm": 0.4987429976463318, + "learning_rate": 5.702341137123746e-06, + "loss": 3.3506, + "step": 25900 + }, + { + "epoch": 366.41284403669727, + "eval_loss": 3.2850100994110107, + "eval_runtime": 76.1264, + "eval_samples_per_second": 131.361, + "eval_steps_per_second": 8.21, + "step": 25900 + }, + { + "epoch": 367.8242766407904, + "grad_norm": 0.5134937763214111, + "learning_rate": 5.68561872909699e-06, + "loss": 3.3502, + "step": 26000 + }, + { + "epoch": 367.8242766407904, + "eval_loss": 3.2814862728118896, + "eval_runtime": 74.6669, + "eval_samples_per_second": 133.928, + "eval_steps_per_second": 8.371, + "step": 26000 + }, + { + "epoch": 369.23570924488354, + "grad_norm": 0.44258707761764526, + "learning_rate": 5.668896321070234e-06, + "loss": 3.3486, + "step": 26100 + }, + { + "epoch": 369.23570924488354, + "eval_loss": 3.277559757232666, + "eval_runtime": 74.7207, + "eval_samples_per_second": 133.832, + "eval_steps_per_second": 8.364, + "step": 26100 + }, + { + "epoch": 370.64714184897673, + "grad_norm": 0.5384514927864075, + "learning_rate": 5.652173913043479e-06, + "loss": 3.3449, + "step": 26200 + }, + { + "epoch": 370.64714184897673, + "eval_loss": 3.2763917446136475, + "eval_runtime": 74.5448, + "eval_samples_per_second": 134.148, + "eval_steps_per_second": 8.384, + "step": 26200 + }, + { + "epoch": 372.05857445306987, + "grad_norm": 0.6275975108146667, + "learning_rate": 5.635451505016722e-06, + "loss": 3.3452, + "step": 26300 + }, + { + "epoch": 372.05857445306987, + "eval_loss": 3.273946523666382, + "eval_runtime": 74.5431, + "eval_samples_per_second": 134.151, + "eval_steps_per_second": 8.384, + "step": 26300 + }, + { + "epoch": 373.470007057163, + "grad_norm": 0.44813865423202515, + "learning_rate": 5.618729096989967e-06, + "loss": 3.3436, + "step": 26400 + }, + { + "epoch": 373.470007057163, + "eval_loss": 3.272003412246704, + "eval_runtime": 74.6769, + "eval_samples_per_second": 133.91, + "eval_steps_per_second": 8.369, + "step": 26400 + }, + { + "epoch": 374.8814396612562, + "grad_norm": 0.5522853136062622, + "learning_rate": 5.602006688963211e-06, + "loss": 3.3418, + "step": 26500 + }, + { + "epoch": 374.8814396612562, + "eval_loss": 3.271955966949463, + "eval_runtime": 74.7546, + "eval_samples_per_second": 133.771, + "eval_steps_per_second": 8.361, + "step": 26500 + }, + { + "epoch": 376.29287226534933, + "grad_norm": 0.6033642888069153, + "learning_rate": 5.585284280936455e-06, + "loss": 3.3406, + "step": 26600 + }, + { + "epoch": 376.29287226534933, + "eval_loss": 3.2729640007019043, + "eval_runtime": 74.7615, + "eval_samples_per_second": 133.759, + "eval_steps_per_second": 8.36, + "step": 26600 + }, + { + "epoch": 377.70430486944247, + "grad_norm": 0.39052197337150574, + "learning_rate": 5.568561872909699e-06, + "loss": 3.3389, + "step": 26700 + }, + { + "epoch": 377.70430486944247, + "eval_loss": 3.2696945667266846, + "eval_runtime": 74.854, + "eval_samples_per_second": 133.593, + "eval_steps_per_second": 8.35, + "step": 26700 + }, + { + "epoch": 379.11573747353566, + "grad_norm": 0.6026007533073425, + "learning_rate": 5.551839464882943e-06, + "loss": 3.3365, + "step": 26800 + }, + { + "epoch": 379.11573747353566, + "eval_loss": 3.2685930728912354, + "eval_runtime": 74.7049, + "eval_samples_per_second": 133.86, + "eval_steps_per_second": 8.366, + "step": 26800 + }, + { + "epoch": 380.5271700776288, + "grad_norm": 0.47670257091522217, + "learning_rate": 5.5351170568561875e-06, + "loss": 3.3368, + "step": 26900 + }, + { + "epoch": 380.5271700776288, + "eval_loss": 3.2685508728027344, + "eval_runtime": 74.7567, + "eval_samples_per_second": 133.767, + "eval_steps_per_second": 8.36, + "step": 26900 + }, + { + "epoch": 381.93860268172193, + "grad_norm": 0.5947960615158081, + "learning_rate": 5.518394648829431e-06, + "loss": 3.3337, + "step": 27000 + }, + { + "epoch": 381.93860268172193, + "eval_loss": 3.2665364742279053, + "eval_runtime": 75.0954, + "eval_samples_per_second": 133.164, + "eval_steps_per_second": 8.323, + "step": 27000 + }, + { + "epoch": 383.3500352858151, + "grad_norm": 0.5476217865943909, + "learning_rate": 5.501672240802676e-06, + "loss": 3.3329, + "step": 27100 + }, + { + "epoch": 383.3500352858151, + "eval_loss": 3.2639975547790527, + "eval_runtime": 75.1039, + "eval_samples_per_second": 133.149, + "eval_steps_per_second": 8.322, + "step": 27100 + }, + { + "epoch": 384.76146788990826, + "grad_norm": 0.4159116744995117, + "learning_rate": 5.48494983277592e-06, + "loss": 3.3315, + "step": 27200 + }, + { + "epoch": 384.76146788990826, + "eval_loss": 3.263084650039673, + "eval_runtime": 74.9436, + "eval_samples_per_second": 133.434, + "eval_steps_per_second": 8.34, + "step": 27200 + }, + { + "epoch": 386.1729004940014, + "grad_norm": 0.5419652462005615, + "learning_rate": 5.468227424749164e-06, + "loss": 3.3303, + "step": 27300 + }, + { + "epoch": 386.1729004940014, + "eval_loss": 3.2612051963806152, + "eval_runtime": 75.1117, + "eval_samples_per_second": 133.135, + "eval_steps_per_second": 8.321, + "step": 27300 + }, + { + "epoch": 387.5843330980946, + "grad_norm": 0.512188732624054, + "learning_rate": 5.451505016722408e-06, + "loss": 3.3292, + "step": 27400 + }, + { + "epoch": 387.5843330980946, + "eval_loss": 3.259936809539795, + "eval_runtime": 75.0941, + "eval_samples_per_second": 133.166, + "eval_steps_per_second": 8.323, + "step": 27400 + }, + { + "epoch": 388.9957657021877, + "grad_norm": 0.727654755115509, + "learning_rate": 5.4347826086956525e-06, + "loss": 3.3269, + "step": 27500 + }, + { + "epoch": 388.9957657021877, + "eval_loss": 3.261517286300659, + "eval_runtime": 75.0628, + "eval_samples_per_second": 133.222, + "eval_steps_per_second": 8.326, + "step": 27500 + }, + { + "epoch": 390.40719830628086, + "grad_norm": 0.5372282862663269, + "learning_rate": 5.418060200668896e-06, + "loss": 3.3254, + "step": 27600 + }, + { + "epoch": 390.40719830628086, + "eval_loss": 3.258228302001953, + "eval_runtime": 74.9449, + "eval_samples_per_second": 133.431, + "eval_steps_per_second": 8.339, + "step": 27600 + }, + { + "epoch": 391.81863091037405, + "grad_norm": 0.5655878186225891, + "learning_rate": 5.401337792642141e-06, + "loss": 3.3242, + "step": 27700 + }, + { + "epoch": 391.81863091037405, + "eval_loss": 3.2560343742370605, + "eval_runtime": 75.2757, + "eval_samples_per_second": 132.845, + "eval_steps_per_second": 8.303, + "step": 27700 + }, + { + "epoch": 393.2300635144672, + "grad_norm": 0.6290187835693359, + "learning_rate": 5.384615384615385e-06, + "loss": 3.3238, + "step": 27800 + }, + { + "epoch": 393.2300635144672, + "eval_loss": 3.2537336349487305, + "eval_runtime": 75.0543, + "eval_samples_per_second": 133.237, + "eval_steps_per_second": 8.327, + "step": 27800 + }, + { + "epoch": 394.6414961185603, + "grad_norm": 0.6421985626220703, + "learning_rate": 5.367892976588629e-06, + "loss": 3.3218, + "step": 27900 + }, + { + "epoch": 394.6414961185603, + "eval_loss": 3.254143238067627, + "eval_runtime": 75.0661, + "eval_samples_per_second": 133.216, + "eval_steps_per_second": 8.326, + "step": 27900 + }, + { + "epoch": 396.0529287226535, + "grad_norm": 0.4675317108631134, + "learning_rate": 5.351170568561873e-06, + "loss": 3.3203, + "step": 28000 + }, + { + "epoch": 396.0529287226535, + "eval_loss": 3.2528676986694336, + "eval_runtime": 74.9957, + "eval_samples_per_second": 133.341, + "eval_steps_per_second": 8.334, + "step": 28000 + }, + { + "epoch": 397.46436132674665, + "grad_norm": 0.5349506735801697, + "learning_rate": 5.334448160535118e-06, + "loss": 3.3185, + "step": 28100 + }, + { + "epoch": 397.46436132674665, + "eval_loss": 3.2534239292144775, + "eval_runtime": 74.7782, + "eval_samples_per_second": 133.729, + "eval_steps_per_second": 8.358, + "step": 28100 + }, + { + "epoch": 398.8757939308398, + "grad_norm": 0.6712241768836975, + "learning_rate": 5.317725752508361e-06, + "loss": 3.3173, + "step": 28200 + }, + { + "epoch": 398.8757939308398, + "eval_loss": 3.2485058307647705, + "eval_runtime": 74.9968, + "eval_samples_per_second": 133.339, + "eval_steps_per_second": 8.334, + "step": 28200 + }, + { + "epoch": 400.287226534933, + "grad_norm": 0.4701189696788788, + "learning_rate": 5.301003344481606e-06, + "loss": 3.3163, + "step": 28300 + }, + { + "epoch": 400.287226534933, + "eval_loss": 3.2490499019622803, + "eval_runtime": 75.0541, + "eval_samples_per_second": 133.237, + "eval_steps_per_second": 8.327, + "step": 28300 + }, + { + "epoch": 401.6986591390261, + "grad_norm": 0.5290805101394653, + "learning_rate": 5.28428093645485e-06, + "loss": 3.315, + "step": 28400 + }, + { + "epoch": 401.6986591390261, + "eval_loss": 3.2493200302124023, + "eval_runtime": 75.1066, + "eval_samples_per_second": 133.144, + "eval_steps_per_second": 8.322, + "step": 28400 + }, + { + "epoch": 403.11009174311926, + "grad_norm": 0.5082231760025024, + "learning_rate": 5.2675585284280935e-06, + "loss": 3.313, + "step": 28500 + }, + { + "epoch": 403.11009174311926, + "eval_loss": 3.2446095943450928, + "eval_runtime": 75.0521, + "eval_samples_per_second": 133.241, + "eval_steps_per_second": 8.328, + "step": 28500 + }, + { + "epoch": 404.52152434721245, + "grad_norm": 0.6178306937217712, + "learning_rate": 5.250836120401338e-06, + "loss": 3.3124, + "step": 28600 + }, + { + "epoch": 404.52152434721245, + "eval_loss": 3.243537425994873, + "eval_runtime": 74.8287, + "eval_samples_per_second": 133.638, + "eval_steps_per_second": 8.352, + "step": 28600 + }, + { + "epoch": 405.9329569513056, + "grad_norm": 0.48364028334617615, + "learning_rate": 5.234113712374582e-06, + "loss": 3.3104, + "step": 28700 + }, + { + "epoch": 405.9329569513056, + "eval_loss": 3.243783712387085, + "eval_runtime": 74.8558, + "eval_samples_per_second": 133.59, + "eval_steps_per_second": 8.349, + "step": 28700 + }, + { + "epoch": 407.3443895553987, + "grad_norm": 0.6693919897079468, + "learning_rate": 5.2173913043478265e-06, + "loss": 3.3088, + "step": 28800 + }, + { + "epoch": 407.3443895553987, + "eval_loss": 3.2437820434570312, + "eval_runtime": 75.0675, + "eval_samples_per_second": 133.213, + "eval_steps_per_second": 8.326, + "step": 28800 + }, + { + "epoch": 408.75582215949186, + "grad_norm": 0.5126225352287292, + "learning_rate": 5.20066889632107e-06, + "loss": 3.3087, + "step": 28900 + }, + { + "epoch": 408.75582215949186, + "eval_loss": 3.2396769523620605, + "eval_runtime": 74.9695, + "eval_samples_per_second": 133.388, + "eval_steps_per_second": 8.337, + "step": 28900 + }, + { + "epoch": 410.16725476358505, + "grad_norm": 0.48420536518096924, + "learning_rate": 5.183946488294315e-06, + "loss": 3.3072, + "step": 29000 + }, + { + "epoch": 410.16725476358505, + "eval_loss": 3.2414257526397705, + "eval_runtime": 75.1143, + "eval_samples_per_second": 133.13, + "eval_steps_per_second": 8.321, + "step": 29000 + }, + { + "epoch": 411.5786873676782, + "grad_norm": 0.4449322819709778, + "learning_rate": 5.167224080267559e-06, + "loss": 3.3052, + "step": 29100 + }, + { + "epoch": 411.5786873676782, + "eval_loss": 3.2392234802246094, + "eval_runtime": 75.0722, + "eval_samples_per_second": 133.205, + "eval_steps_per_second": 8.325, + "step": 29100 + }, + { + "epoch": 412.9901199717713, + "grad_norm": 0.5432822704315186, + "learning_rate": 5.150501672240803e-06, + "loss": 3.3039, + "step": 29200 + }, + { + "epoch": 412.9901199717713, + "eval_loss": 3.236405372619629, + "eval_runtime": 74.8808, + "eval_samples_per_second": 133.546, + "eval_steps_per_second": 8.347, + "step": 29200 + }, + { + "epoch": 414.4015525758645, + "grad_norm": 0.5220522880554199, + "learning_rate": 5.133779264214047e-06, + "loss": 3.3038, + "step": 29300 + }, + { + "epoch": 414.4015525758645, + "eval_loss": 3.237210273742676, + "eval_runtime": 75.1138, + "eval_samples_per_second": 133.131, + "eval_steps_per_second": 8.321, + "step": 29300 + }, + { + "epoch": 415.81298517995765, + "grad_norm": 0.40687912702560425, + "learning_rate": 5.1170568561872916e-06, + "loss": 3.3017, + "step": 29400 + }, + { + "epoch": 415.81298517995765, + "eval_loss": 3.2353012561798096, + "eval_runtime": 75.0181, + "eval_samples_per_second": 133.301, + "eval_steps_per_second": 8.331, + "step": 29400 + }, + { + "epoch": 417.2244177840508, + "grad_norm": 0.5893601775169373, + "learning_rate": 5.100334448160535e-06, + "loss": 3.3005, + "step": 29500 + }, + { + "epoch": 417.2244177840508, + "eval_loss": 3.2351584434509277, + "eval_runtime": 75.0403, + "eval_samples_per_second": 133.262, + "eval_steps_per_second": 8.329, + "step": 29500 + }, + { + "epoch": 418.635850388144, + "grad_norm": 0.5068885087966919, + "learning_rate": 5.08361204013378e-06, + "loss": 3.2986, + "step": 29600 + }, + { + "epoch": 418.635850388144, + "eval_loss": 3.234811782836914, + "eval_runtime": 75.0157, + "eval_samples_per_second": 133.306, + "eval_steps_per_second": 8.332, + "step": 29600 + }, + { + "epoch": 420.0472829922371, + "grad_norm": 0.5675772428512573, + "learning_rate": 5.066889632107024e-06, + "loss": 3.2984, + "step": 29700 + }, + { + "epoch": 420.0472829922371, + "eval_loss": 3.231552839279175, + "eval_runtime": 74.8696, + "eval_samples_per_second": 133.566, + "eval_steps_per_second": 8.348, + "step": 29700 + }, + { + "epoch": 421.45871559633025, + "grad_norm": 0.5616305470466614, + "learning_rate": 5.050167224080268e-06, + "loss": 3.2971, + "step": 29800 + }, + { + "epoch": 421.45871559633025, + "eval_loss": 3.2294559478759766, + "eval_runtime": 74.9526, + "eval_samples_per_second": 133.418, + "eval_steps_per_second": 8.339, + "step": 29800 + }, + { + "epoch": 422.87014820042344, + "grad_norm": 0.4248732328414917, + "learning_rate": 5.033444816053512e-06, + "loss": 3.2951, + "step": 29900 + }, + { + "epoch": 422.87014820042344, + "eval_loss": 3.2306745052337646, + "eval_runtime": 74.9879, + "eval_samples_per_second": 133.355, + "eval_steps_per_second": 8.335, + "step": 29900 + }, + { + "epoch": 424.2815808045166, + "grad_norm": 0.4150717556476593, + "learning_rate": 5.016722408026757e-06, + "loss": 3.2934, + "step": 30000 + }, + { + "epoch": 424.2815808045166, + "eval_loss": 3.2273478507995605, + "eval_runtime": 75.0475, + "eval_samples_per_second": 133.249, + "eval_steps_per_second": 8.328, + "step": 30000 + }, + { + "epoch": 425.6930134086097, + "grad_norm": 0.43221816420555115, + "learning_rate": 5e-06, + "loss": 3.293, + "step": 30100 + }, + { + "epoch": 425.6930134086097, + "eval_loss": 3.2289834022521973, + "eval_runtime": 75.3497, + "eval_samples_per_second": 132.715, + "eval_steps_per_second": 8.295, + "step": 30100 + }, + { + "epoch": 427.1044460127029, + "grad_norm": 0.5517231822013855, + "learning_rate": 4.983277591973244e-06, + "loss": 3.2926, + "step": 30200 + }, + { + "epoch": 427.1044460127029, + "eval_loss": 3.2246079444885254, + "eval_runtime": 75.5521, + "eval_samples_per_second": 132.359, + "eval_steps_per_second": 8.272, + "step": 30200 + }, + { + "epoch": 428.51587861679604, + "grad_norm": 0.47511130571365356, + "learning_rate": 4.966555183946489e-06, + "loss": 3.2895, + "step": 30300 + }, + { + "epoch": 428.51587861679604, + "eval_loss": 3.2235753536224365, + "eval_runtime": 75.5117, + "eval_samples_per_second": 132.43, + "eval_steps_per_second": 8.277, + "step": 30300 + }, + { + "epoch": 429.9273112208892, + "grad_norm": 0.4866757392883301, + "learning_rate": 4.9498327759197325e-06, + "loss": 3.2895, + "step": 30400 + }, + { + "epoch": 429.9273112208892, + "eval_loss": 3.2261710166931152, + "eval_runtime": 75.5689, + "eval_samples_per_second": 132.33, + "eval_steps_per_second": 8.271, + "step": 30400 + }, + { + "epoch": 431.3387438249824, + "grad_norm": 0.4943363666534424, + "learning_rate": 4.933110367892977e-06, + "loss": 3.2883, + "step": 30500 + }, + { + "epoch": 431.3387438249824, + "eval_loss": 3.2263576984405518, + "eval_runtime": 75.3634, + "eval_samples_per_second": 132.69, + "eval_steps_per_second": 8.293, + "step": 30500 + }, + { + "epoch": 432.7501764290755, + "grad_norm": 0.5846276879310608, + "learning_rate": 4.916387959866221e-06, + "loss": 3.2873, + "step": 30600 + }, + { + "epoch": 432.7501764290755, + "eval_loss": 3.220747709274292, + "eval_runtime": 75.4241, + "eval_samples_per_second": 132.584, + "eval_steps_per_second": 8.286, + "step": 30600 + }, + { + "epoch": 434.16160903316865, + "grad_norm": 0.4371052086353302, + "learning_rate": 4.8996655518394655e-06, + "loss": 3.2857, + "step": 30700 + }, + { + "epoch": 434.16160903316865, + "eval_loss": 3.2217774391174316, + "eval_runtime": 75.5005, + "eval_samples_per_second": 132.45, + "eval_steps_per_second": 8.278, + "step": 30700 + }, + { + "epoch": 435.57304163726184, + "grad_norm": 0.5578730702400208, + "learning_rate": 4.882943143812709e-06, + "loss": 3.2844, + "step": 30800 + }, + { + "epoch": 435.57304163726184, + "eval_loss": 3.219982862472534, + "eval_runtime": 75.31, + "eval_samples_per_second": 132.784, + "eval_steps_per_second": 8.299, + "step": 30800 + }, + { + "epoch": 436.984474241355, + "grad_norm": 0.4692535996437073, + "learning_rate": 4.866220735785954e-06, + "loss": 3.2839, + "step": 30900 + }, + { + "epoch": 436.984474241355, + "eval_loss": 3.2172024250030518, + "eval_runtime": 75.315, + "eval_samples_per_second": 132.776, + "eval_steps_per_second": 8.298, + "step": 30900 + }, + { + "epoch": 438.3959068454481, + "grad_norm": 0.5210705995559692, + "learning_rate": 4.849498327759198e-06, + "loss": 3.2817, + "step": 31000 + }, + { + "epoch": 438.3959068454481, + "eval_loss": 3.220083236694336, + "eval_runtime": 74.9928, + "eval_samples_per_second": 133.346, + "eval_steps_per_second": 8.334, + "step": 31000 + }, + { + "epoch": 439.8073394495413, + "grad_norm": 0.417936235666275, + "learning_rate": 4.832775919732442e-06, + "loss": 3.2814, + "step": 31100 + }, + { + "epoch": 439.8073394495413, + "eval_loss": 3.218825101852417, + "eval_runtime": 75.2435, + "eval_samples_per_second": 132.902, + "eval_steps_per_second": 8.306, + "step": 31100 + }, + { + "epoch": 441.21877205363444, + "grad_norm": 0.5074178576469421, + "learning_rate": 4.816053511705686e-06, + "loss": 3.2805, + "step": 31200 + }, + { + "epoch": 441.21877205363444, + "eval_loss": 3.218318462371826, + "eval_runtime": 75.2501, + "eval_samples_per_second": 132.89, + "eval_steps_per_second": 8.306, + "step": 31200 + }, + { + "epoch": 442.6302046577276, + "grad_norm": 0.6320539116859436, + "learning_rate": 4.799331103678931e-06, + "loss": 3.2791, + "step": 31300 + }, + { + "epoch": 442.6302046577276, + "eval_loss": 3.215117931365967, + "eval_runtime": 75.2339, + "eval_samples_per_second": 132.919, + "eval_steps_per_second": 8.307, + "step": 31300 + }, + { + "epoch": 444.04163726182077, + "grad_norm": 0.573082685470581, + "learning_rate": 4.782608695652174e-06, + "loss": 3.2784, + "step": 31400 + }, + { + "epoch": 444.04163726182077, + "eval_loss": 3.2140893936157227, + "eval_runtime": 75.2464, + "eval_samples_per_second": 132.897, + "eval_steps_per_second": 8.306, + "step": 31400 + }, + { + "epoch": 445.4530698659139, + "grad_norm": 0.4167691469192505, + "learning_rate": 4.765886287625419e-06, + "loss": 3.2764, + "step": 31500 + }, + { + "epoch": 445.4530698659139, + "eval_loss": 3.209411382675171, + "eval_runtime": 75.3085, + "eval_samples_per_second": 132.787, + "eval_steps_per_second": 8.299, + "step": 31500 + }, + { + "epoch": 446.86450247000704, + "grad_norm": 0.4705657958984375, + "learning_rate": 4.749163879598663e-06, + "loss": 3.2769, + "step": 31600 + }, + { + "epoch": 446.86450247000704, + "eval_loss": 3.212165117263794, + "eval_runtime": 75.4939, + "eval_samples_per_second": 132.461, + "eval_steps_per_second": 8.279, + "step": 31600 + }, + { + "epoch": 448.27593507410023, + "grad_norm": 0.48369839787483215, + "learning_rate": 4.732441471571907e-06, + "loss": 3.2737, + "step": 31700 + }, + { + "epoch": 448.27593507410023, + "eval_loss": 3.211515426635742, + "eval_runtime": 75.5083, + "eval_samples_per_second": 132.436, + "eval_steps_per_second": 8.277, + "step": 31700 + }, + { + "epoch": 449.68736767819337, + "grad_norm": 0.5090962648391724, + "learning_rate": 4.715719063545151e-06, + "loss": 3.2739, + "step": 31800 + }, + { + "epoch": 449.68736767819337, + "eval_loss": 3.2119221687316895, + "eval_runtime": 75.5485, + "eval_samples_per_second": 132.365, + "eval_steps_per_second": 8.273, + "step": 31800 + }, + { + "epoch": 451.0988002822865, + "grad_norm": 0.5034780502319336, + "learning_rate": 4.698996655518395e-06, + "loss": 3.2724, + "step": 31900 + }, + { + "epoch": 451.0988002822865, + "eval_loss": 3.2079360485076904, + "eval_runtime": 75.5032, + "eval_samples_per_second": 132.445, + "eval_steps_per_second": 8.278, + "step": 31900 + }, + { + "epoch": 452.5102328863797, + "grad_norm": 0.4804918169975281, + "learning_rate": 4.6822742474916394e-06, + "loss": 3.2711, + "step": 32000 + }, + { + "epoch": 452.5102328863797, + "eval_loss": 3.206244468688965, + "eval_runtime": 75.5098, + "eval_samples_per_second": 132.433, + "eval_steps_per_second": 8.277, + "step": 32000 + }, + { + "epoch": 453.92166549047283, + "grad_norm": 0.47639667987823486, + "learning_rate": 4.665551839464883e-06, + "loss": 3.2704, + "step": 32100 + }, + { + "epoch": 453.92166549047283, + "eval_loss": 3.2067034244537354, + "eval_runtime": 75.4292, + "eval_samples_per_second": 132.575, + "eval_steps_per_second": 8.286, + "step": 32100 + }, + { + "epoch": 455.33309809456597, + "grad_norm": 0.5194268226623535, + "learning_rate": 4.648829431438128e-06, + "loss": 3.2692, + "step": 32200 + }, + { + "epoch": 455.33309809456597, + "eval_loss": 3.2050275802612305, + "eval_runtime": 75.3819, + "eval_samples_per_second": 132.658, + "eval_steps_per_second": 8.291, + "step": 32200 + }, + { + "epoch": 456.74453069865916, + "grad_norm": 0.48084452748298645, + "learning_rate": 4.6321070234113715e-06, + "loss": 3.268, + "step": 32300 + }, + { + "epoch": 456.74453069865916, + "eval_loss": 3.205712080001831, + "eval_runtime": 75.4989, + "eval_samples_per_second": 132.452, + "eval_steps_per_second": 8.278, + "step": 32300 + }, + { + "epoch": 458.1559633027523, + "grad_norm": 0.513474702835083, + "learning_rate": 4.615384615384616e-06, + "loss": 3.2673, + "step": 32400 + }, + { + "epoch": 458.1559633027523, + "eval_loss": 3.2028987407684326, + "eval_runtime": 75.5988, + "eval_samples_per_second": 132.277, + "eval_steps_per_second": 8.267, + "step": 32400 + }, + { + "epoch": 459.56739590684543, + "grad_norm": 0.4173973798751831, + "learning_rate": 4.59866220735786e-06, + "loss": 3.2659, + "step": 32500 + }, + { + "epoch": 459.56739590684543, + "eval_loss": 3.2002434730529785, + "eval_runtime": 75.375, + "eval_samples_per_second": 132.67, + "eval_steps_per_second": 8.292, + "step": 32500 + }, + { + "epoch": 460.9788285109386, + "grad_norm": 0.44799456000328064, + "learning_rate": 4.581939799331104e-06, + "loss": 3.2648, + "step": 32600 + }, + { + "epoch": 460.9788285109386, + "eval_loss": 3.2009098529815674, + "eval_runtime": 75.4432, + "eval_samples_per_second": 132.55, + "eval_steps_per_second": 8.284, + "step": 32600 + }, + { + "epoch": 462.39026111503176, + "grad_norm": 0.4446271061897278, + "learning_rate": 4.565217391304348e-06, + "loss": 3.2645, + "step": 32700 + }, + { + "epoch": 462.39026111503176, + "eval_loss": 3.1985297203063965, + "eval_runtime": 75.461, + "eval_samples_per_second": 132.519, + "eval_steps_per_second": 8.282, + "step": 32700 + }, + { + "epoch": 463.8016937191249, + "grad_norm": 0.43961068987846375, + "learning_rate": 4.548494983277592e-06, + "loss": 3.2624, + "step": 32800 + }, + { + "epoch": 463.8016937191249, + "eval_loss": 3.202833890914917, + "eval_runtime": 75.521, + "eval_samples_per_second": 132.413, + "eval_steps_per_second": 8.276, + "step": 32800 + }, + { + "epoch": 465.2131263232181, + "grad_norm": 0.47034966945648193, + "learning_rate": 4.531772575250837e-06, + "loss": 3.2617, + "step": 32900 + }, + { + "epoch": 465.2131263232181, + "eval_loss": 3.2012619972229004, + "eval_runtime": 75.7188, + "eval_samples_per_second": 132.068, + "eval_steps_per_second": 8.254, + "step": 32900 + }, + { + "epoch": 466.6245589273112, + "grad_norm": 0.4704744517803192, + "learning_rate": 4.51505016722408e-06, + "loss": 3.2614, + "step": 33000 + }, + { + "epoch": 466.6245589273112, + "eval_loss": 3.1989195346832275, + "eval_runtime": 75.4062, + "eval_samples_per_second": 132.615, + "eval_steps_per_second": 8.288, + "step": 33000 + }, + { + "epoch": 468.03599153140436, + "grad_norm": 0.5247920155525208, + "learning_rate": 4.498327759197324e-06, + "loss": 3.2597, + "step": 33100 + }, + { + "epoch": 468.03599153140436, + "eval_loss": 3.199160099029541, + "eval_runtime": 75.54, + "eval_samples_per_second": 132.38, + "eval_steps_per_second": 8.274, + "step": 33100 + }, + { + "epoch": 469.44742413549756, + "grad_norm": 0.5080961585044861, + "learning_rate": 4.481605351170569e-06, + "loss": 3.2591, + "step": 33200 + }, + { + "epoch": 469.44742413549756, + "eval_loss": 3.1960511207580566, + "eval_runtime": 75.4862, + "eval_samples_per_second": 132.475, + "eval_steps_per_second": 8.28, + "step": 33200 + }, + { + "epoch": 470.8588567395907, + "grad_norm": 0.47639742493629456, + "learning_rate": 4.4648829431438125e-06, + "loss": 3.2584, + "step": 33300 + }, + { + "epoch": 470.8588567395907, + "eval_loss": 3.1962883472442627, + "eval_runtime": 75.505, + "eval_samples_per_second": 132.442, + "eval_steps_per_second": 8.278, + "step": 33300 + }, + { + "epoch": 472.2702893436838, + "grad_norm": 0.5254648923873901, + "learning_rate": 4.448160535117057e-06, + "loss": 3.2572, + "step": 33400 + }, + { + "epoch": 472.2702893436838, + "eval_loss": 3.1971852779388428, + "eval_runtime": 73.998, + "eval_samples_per_second": 135.139, + "eval_steps_per_second": 8.446, + "step": 33400 + }, + { + "epoch": 473.681721947777, + "grad_norm": 0.424251526594162, + "learning_rate": 4.431438127090301e-06, + "loss": 3.2564, + "step": 33500 + }, + { + "epoch": 473.681721947777, + "eval_loss": 3.1964030265808105, + "eval_runtime": 74.3305, + "eval_samples_per_second": 134.534, + "eval_steps_per_second": 8.408, + "step": 33500 + }, + { + "epoch": 475.09315455187016, + "grad_norm": 0.47462198138237, + "learning_rate": 4.4147157190635455e-06, + "loss": 3.2543, + "step": 33600 + }, + { + "epoch": 475.09315455187016, + "eval_loss": 3.1918516159057617, + "eval_runtime": 75.4902, + "eval_samples_per_second": 132.468, + "eval_steps_per_second": 8.279, + "step": 33600 + }, + { + "epoch": 476.5045871559633, + "grad_norm": 0.4497796297073364, + "learning_rate": 4.397993311036789e-06, + "loss": 3.255, + "step": 33700 + }, + { + "epoch": 476.5045871559633, + "eval_loss": 3.1941463947296143, + "eval_runtime": 75.3615, + "eval_samples_per_second": 132.694, + "eval_steps_per_second": 8.293, + "step": 33700 + }, + { + "epoch": 477.9160197600565, + "grad_norm": 0.3968687355518341, + "learning_rate": 4.381270903010034e-06, + "loss": 3.2538, + "step": 33800 + }, + { + "epoch": 477.9160197600565, + "eval_loss": 3.188390016555786, + "eval_runtime": 75.5247, + "eval_samples_per_second": 132.407, + "eval_steps_per_second": 8.275, + "step": 33800 + }, + { + "epoch": 479.3274523641496, + "grad_norm": 0.4359143376350403, + "learning_rate": 4.364548494983278e-06, + "loss": 3.2524, + "step": 33900 + }, + { + "epoch": 479.3274523641496, + "eval_loss": 3.191363573074341, + "eval_runtime": 75.4796, + "eval_samples_per_second": 132.486, + "eval_steps_per_second": 8.28, + "step": 33900 + }, + { + "epoch": 480.73888496824276, + "grad_norm": 0.39059412479400635, + "learning_rate": 4.347826086956522e-06, + "loss": 3.251, + "step": 34000 + }, + { + "epoch": 480.73888496824276, + "eval_loss": 3.191560745239258, + "eval_runtime": 73.7784, + "eval_samples_per_second": 135.541, + "eval_steps_per_second": 8.471, + "step": 34000 + }, + { + "epoch": 482.1503175723359, + "grad_norm": 0.5065882802009583, + "learning_rate": 4.331103678929766e-06, + "loss": 3.251, + "step": 34100 + }, + { + "epoch": 482.1503175723359, + "eval_loss": 3.18879771232605, + "eval_runtime": 74.1572, + "eval_samples_per_second": 134.849, + "eval_steps_per_second": 8.428, + "step": 34100 + }, + { + "epoch": 483.5617501764291, + "grad_norm": 0.5141859650611877, + "learning_rate": 4.3143812709030106e-06, + "loss": 3.2489, + "step": 34200 + }, + { + "epoch": 483.5617501764291, + "eval_loss": 3.189378023147583, + "eval_runtime": 73.8069, + "eval_samples_per_second": 135.489, + "eval_steps_per_second": 8.468, + "step": 34200 + }, + { + "epoch": 484.9731827805222, + "grad_norm": 0.4633908271789551, + "learning_rate": 4.297658862876254e-06, + "loss": 3.2497, + "step": 34300 + }, + { + "epoch": 484.9731827805222, + "eval_loss": 3.1851470470428467, + "eval_runtime": 74.1044, + "eval_samples_per_second": 134.945, + "eval_steps_per_second": 8.434, + "step": 34300 + }, + { + "epoch": 486.38461538461536, + "grad_norm": 0.5483572483062744, + "learning_rate": 4.280936454849499e-06, + "loss": 3.2476, + "step": 34400 + }, + { + "epoch": 486.38461538461536, + "eval_loss": 3.1872897148132324, + "eval_runtime": 74.0461, + "eval_samples_per_second": 135.051, + "eval_steps_per_second": 8.441, + "step": 34400 + }, + { + "epoch": 487.79604798870855, + "grad_norm": 0.46806034445762634, + "learning_rate": 4.264214046822743e-06, + "loss": 3.2475, + "step": 34500 + }, + { + "epoch": 487.79604798870855, + "eval_loss": 3.1827573776245117, + "eval_runtime": 74.1108, + "eval_samples_per_second": 134.933, + "eval_steps_per_second": 8.433, + "step": 34500 + }, + { + "epoch": 489.2074805928017, + "grad_norm": 0.47688722610473633, + "learning_rate": 4.247491638795987e-06, + "loss": 3.2457, + "step": 34600 + }, + { + "epoch": 489.2074805928017, + "eval_loss": 3.1827759742736816, + "eval_runtime": 73.8156, + "eval_samples_per_second": 135.473, + "eval_steps_per_second": 8.467, + "step": 34600 + }, + { + "epoch": 490.6189131968948, + "grad_norm": 0.40754401683807373, + "learning_rate": 4.230769230769231e-06, + "loss": 3.2449, + "step": 34700 + }, + { + "epoch": 490.6189131968948, + "eval_loss": 3.1827995777130127, + "eval_runtime": 74.1338, + "eval_samples_per_second": 134.891, + "eval_steps_per_second": 8.431, + "step": 34700 + }, + { + "epoch": 492.030345800988, + "grad_norm": 0.4188888370990753, + "learning_rate": 4.214046822742475e-06, + "loss": 3.2445, + "step": 34800 + }, + { + "epoch": 492.030345800988, + "eval_loss": 3.181915283203125, + "eval_runtime": 74.1167, + "eval_samples_per_second": 134.922, + "eval_steps_per_second": 8.433, + "step": 34800 + }, + { + "epoch": 493.44177840508115, + "grad_norm": 0.4735229015350342, + "learning_rate": 4.197324414715719e-06, + "loss": 3.2435, + "step": 34900 + }, + { + "epoch": 493.44177840508115, + "eval_loss": 3.179292917251587, + "eval_runtime": 74.1214, + "eval_samples_per_second": 134.914, + "eval_steps_per_second": 8.432, + "step": 34900 + }, + { + "epoch": 494.8532110091743, + "grad_norm": 0.5335908532142639, + "learning_rate": 4.180602006688963e-06, + "loss": 3.2426, + "step": 35000 + }, + { + "epoch": 494.8532110091743, + "eval_loss": 3.180798053741455, + "eval_runtime": 74.1495, + "eval_samples_per_second": 134.863, + "eval_steps_per_second": 8.429, + "step": 35000 + }, + { + "epoch": 496.2646436132675, + "grad_norm": 0.48122459650039673, + "learning_rate": 4.163879598662208e-06, + "loss": 3.2424, + "step": 35100 + }, + { + "epoch": 496.2646436132675, + "eval_loss": 3.1801769733428955, + "eval_runtime": 73.8056, + "eval_samples_per_second": 135.491, + "eval_steps_per_second": 8.468, + "step": 35100 + }, + { + "epoch": 497.6760762173606, + "grad_norm": 0.4501022398471832, + "learning_rate": 4.1471571906354515e-06, + "loss": 3.2409, + "step": 35200 + }, + { + "epoch": 497.6760762173606, + "eval_loss": 3.180279016494751, + "eval_runtime": 75.5755, + "eval_samples_per_second": 132.318, + "eval_steps_per_second": 8.27, + "step": 35200 + }, + { + "epoch": 499.08750882145375, + "grad_norm": 0.45166870951652527, + "learning_rate": 4.130434782608696e-06, + "loss": 3.2396, + "step": 35300 + }, + { + "epoch": 499.08750882145375, + "eval_loss": 3.1797635555267334, + "eval_runtime": 75.5905, + "eval_samples_per_second": 132.292, + "eval_steps_per_second": 8.268, + "step": 35300 + }, + { + "epoch": 500.49894142554695, + "grad_norm": 0.48286956548690796, + "learning_rate": 4.11371237458194e-06, + "loss": 3.2387, + "step": 35400 + }, + { + "epoch": 500.49894142554695, + "eval_loss": 3.180513858795166, + "eval_runtime": 75.5591, + "eval_samples_per_second": 132.347, + "eval_steps_per_second": 8.272, + "step": 35400 + }, + { + "epoch": 501.9103740296401, + "grad_norm": 0.4946323335170746, + "learning_rate": 4.0969899665551845e-06, + "loss": 3.2381, + "step": 35500 + }, + { + "epoch": 501.9103740296401, + "eval_loss": 3.1741671562194824, + "eval_runtime": 75.5669, + "eval_samples_per_second": 132.333, + "eval_steps_per_second": 8.271, + "step": 35500 + }, + { + "epoch": 503.3218066337332, + "grad_norm": 0.5477709174156189, + "learning_rate": 4.080267558528428e-06, + "loss": 3.237, + "step": 35600 + }, + { + "epoch": 503.3218066337332, + "eval_loss": 3.1784253120422363, + "eval_runtime": 74.1136, + "eval_samples_per_second": 134.928, + "eval_steps_per_second": 8.433, + "step": 35600 + }, + { + "epoch": 504.7332392378264, + "grad_norm": 0.4314318299293518, + "learning_rate": 4.063545150501673e-06, + "loss": 3.2368, + "step": 35700 + }, + { + "epoch": 504.7332392378264, + "eval_loss": 3.1778831481933594, + "eval_runtime": 74.392, + "eval_samples_per_second": 134.423, + "eval_steps_per_second": 8.401, + "step": 35700 + }, + { + "epoch": 506.14467184191955, + "grad_norm": 0.5121575593948364, + "learning_rate": 4.046822742474917e-06, + "loss": 3.2348, + "step": 35800 + }, + { + "epoch": 506.14467184191955, + "eval_loss": 3.1743390560150146, + "eval_runtime": 74.2648, + "eval_samples_per_second": 134.653, + "eval_steps_per_second": 8.416, + "step": 35800 + }, + { + "epoch": 507.5561044460127, + "grad_norm": 0.5254048705101013, + "learning_rate": 4.030100334448161e-06, + "loss": 3.2356, + "step": 35900 + }, + { + "epoch": 507.5561044460127, + "eval_loss": 3.1765570640563965, + "eval_runtime": 74.3603, + "eval_samples_per_second": 134.48, + "eval_steps_per_second": 8.405, + "step": 35900 + }, + { + "epoch": 508.9675370501059, + "grad_norm": 0.5628880858421326, + "learning_rate": 4.013377926421405e-06, + "loss": 3.2345, + "step": 36000 + }, + { + "epoch": 508.9675370501059, + "eval_loss": 3.1755409240722656, + "eval_runtime": 74.3666, + "eval_samples_per_second": 134.469, + "eval_steps_per_second": 8.404, + "step": 36000 + }, + { + "epoch": 510.378969654199, + "grad_norm": 0.43160390853881836, + "learning_rate": 3.99665551839465e-06, + "loss": 3.2327, + "step": 36100 + }, + { + "epoch": 510.378969654199, + "eval_loss": 3.1721553802490234, + "eval_runtime": 75.0918, + "eval_samples_per_second": 133.17, + "eval_steps_per_second": 8.323, + "step": 36100 + }, + { + "epoch": 511.79040225829215, + "grad_norm": 0.4397335350513458, + "learning_rate": 3.979933110367893e-06, + "loss": 3.2324, + "step": 36200 + }, + { + "epoch": 511.79040225829215, + "eval_loss": 3.174211263656616, + "eval_runtime": 75.613, + "eval_samples_per_second": 132.252, + "eval_steps_per_second": 8.266, + "step": 36200 + }, + { + "epoch": 513.2018348623853, + "grad_norm": 0.42536553740501404, + "learning_rate": 3.963210702341138e-06, + "loss": 3.2324, + "step": 36300 + }, + { + "epoch": 513.2018348623853, + "eval_loss": 3.172475814819336, + "eval_runtime": 75.4025, + "eval_samples_per_second": 132.622, + "eval_steps_per_second": 8.289, + "step": 36300 + }, + { + "epoch": 514.6132674664784, + "grad_norm": 0.46854251623153687, + "learning_rate": 3.946488294314382e-06, + "loss": 3.2306, + "step": 36400 + }, + { + "epoch": 514.6132674664784, + "eval_loss": 3.1727116107940674, + "eval_runtime": 75.467, + "eval_samples_per_second": 132.508, + "eval_steps_per_second": 8.282, + "step": 36400 + }, + { + "epoch": 516.0247000705716, + "grad_norm": 0.40861082077026367, + "learning_rate": 3.929765886287626e-06, + "loss": 3.2304, + "step": 36500 + }, + { + "epoch": 516.0247000705716, + "eval_loss": 3.171363353729248, + "eval_runtime": 75.3463, + "eval_samples_per_second": 132.721, + "eval_steps_per_second": 8.295, + "step": 36500 + }, + { + "epoch": 517.4361326746648, + "grad_norm": 0.4804244339466095, + "learning_rate": 3.91304347826087e-06, + "loss": 3.2291, + "step": 36600 + }, + { + "epoch": 517.4361326746648, + "eval_loss": 3.168253183364868, + "eval_runtime": 75.5393, + "eval_samples_per_second": 132.381, + "eval_steps_per_second": 8.274, + "step": 36600 + }, + { + "epoch": 518.8475652787579, + "grad_norm": 0.4989255964756012, + "learning_rate": 3.896321070234114e-06, + "loss": 3.2283, + "step": 36700 + }, + { + "epoch": 518.8475652787579, + "eval_loss": 3.1694819927215576, + "eval_runtime": 75.6196, + "eval_samples_per_second": 132.241, + "eval_steps_per_second": 8.265, + "step": 36700 + }, + { + "epoch": 520.2589978828511, + "grad_norm": 0.4205889403820038, + "learning_rate": 3.8795986622073584e-06, + "loss": 3.2272, + "step": 36800 + }, + { + "epoch": 520.2589978828511, + "eval_loss": 3.167910575866699, + "eval_runtime": 75.533, + "eval_samples_per_second": 132.392, + "eval_steps_per_second": 8.275, + "step": 36800 + }, + { + "epoch": 521.6704304869443, + "grad_norm": 0.49398526549339294, + "learning_rate": 3.862876254180602e-06, + "loss": 3.227, + "step": 36900 + }, + { + "epoch": 521.6704304869443, + "eval_loss": 3.166125774383545, + "eval_runtime": 75.4176, + "eval_samples_per_second": 132.595, + "eval_steps_per_second": 8.287, + "step": 36900 + }, + { + "epoch": 523.0818630910373, + "grad_norm": 0.42277684807777405, + "learning_rate": 3.846153846153847e-06, + "loss": 3.2252, + "step": 37000 + }, + { + "epoch": 523.0818630910373, + "eval_loss": 3.1657633781433105, + "eval_runtime": 75.5313, + "eval_samples_per_second": 132.395, + "eval_steps_per_second": 8.275, + "step": 37000 + }, + { + "epoch": 524.4932956951305, + "grad_norm": 0.4734378159046173, + "learning_rate": 3.8294314381270906e-06, + "loss": 3.225, + "step": 37100 + }, + { + "epoch": 524.4932956951305, + "eval_loss": 3.167834997177124, + "eval_runtime": 75.5739, + "eval_samples_per_second": 132.321, + "eval_steps_per_second": 8.27, + "step": 37100 + }, + { + "epoch": 525.9047282992237, + "grad_norm": 0.466743528842926, + "learning_rate": 3.812709030100335e-06, + "loss": 3.225, + "step": 37200 + }, + { + "epoch": 525.9047282992237, + "eval_loss": 3.165196180343628, + "eval_runtime": 75.556, + "eval_samples_per_second": 132.352, + "eval_steps_per_second": 8.272, + "step": 37200 + }, + { + "epoch": 527.3161609033168, + "grad_norm": 0.4317799210548401, + "learning_rate": 3.7959866220735793e-06, + "loss": 3.224, + "step": 37300 + }, + { + "epoch": 527.3161609033168, + "eval_loss": 3.1631827354431152, + "eval_runtime": 75.6043, + "eval_samples_per_second": 132.268, + "eval_steps_per_second": 8.267, + "step": 37300 + }, + { + "epoch": 528.72759350741, + "grad_norm": 0.39390048384666443, + "learning_rate": 3.7792642140468235e-06, + "loss": 3.2237, + "step": 37400 + }, + { + "epoch": 528.72759350741, + "eval_loss": 3.165860414505005, + "eval_runtime": 75.4156, + "eval_samples_per_second": 132.599, + "eval_steps_per_second": 8.287, + "step": 37400 + }, + { + "epoch": 530.1390261115032, + "grad_norm": 0.4415905177593231, + "learning_rate": 3.7625418060200673e-06, + "loss": 3.2233, + "step": 37500 + }, + { + "epoch": 530.1390261115032, + "eval_loss": 3.163349151611328, + "eval_runtime": 75.5719, + "eval_samples_per_second": 132.324, + "eval_steps_per_second": 8.27, + "step": 37500 + }, + { + "epoch": 531.5504587155963, + "grad_norm": 0.5063501596450806, + "learning_rate": 3.745819397993311e-06, + "loss": 3.2206, + "step": 37600 + }, + { + "epoch": 531.5504587155963, + "eval_loss": 3.163081407546997, + "eval_runtime": 75.5766, + "eval_samples_per_second": 132.316, + "eval_steps_per_second": 8.27, + "step": 37600 + }, + { + "epoch": 532.9618913196895, + "grad_norm": 0.4227210283279419, + "learning_rate": 3.7290969899665552e-06, + "loss": 3.2212, + "step": 37700 + }, + { + "epoch": 532.9618913196895, + "eval_loss": 3.1642813682556152, + "eval_runtime": 75.6173, + "eval_samples_per_second": 132.245, + "eval_steps_per_second": 8.265, + "step": 37700 + }, + { + "epoch": 534.3733239237827, + "grad_norm": 0.5058636665344238, + "learning_rate": 3.7123745819397994e-06, + "loss": 3.2198, + "step": 37800 + }, + { + "epoch": 534.3733239237827, + "eval_loss": 3.1637208461761475, + "eval_runtime": 75.5813, + "eval_samples_per_second": 132.308, + "eval_steps_per_second": 8.269, + "step": 37800 + }, + { + "epoch": 535.7847565278757, + "grad_norm": 0.37743571400642395, + "learning_rate": 3.6956521739130436e-06, + "loss": 3.2193, + "step": 37900 + }, + { + "epoch": 535.7847565278757, + "eval_loss": 3.161426544189453, + "eval_runtime": 75.4254, + "eval_samples_per_second": 132.581, + "eval_steps_per_second": 8.286, + "step": 37900 + }, + { + "epoch": 537.1961891319689, + "grad_norm": 0.4442863464355469, + "learning_rate": 3.6789297658862878e-06, + "loss": 3.2186, + "step": 38000 + }, + { + "epoch": 537.1961891319689, + "eval_loss": 3.1596264839172363, + "eval_runtime": 75.6205, + "eval_samples_per_second": 132.239, + "eval_steps_per_second": 8.265, + "step": 38000 + }, + { + "epoch": 538.6076217360621, + "grad_norm": 0.46085959672927856, + "learning_rate": 3.662207357859532e-06, + "loss": 3.2176, + "step": 38100 + }, + { + "epoch": 538.6076217360621, + "eval_loss": 3.161233901977539, + "eval_runtime": 75.6478, + "eval_samples_per_second": 132.191, + "eval_steps_per_second": 8.262, + "step": 38100 + }, + { + "epoch": 540.0190543401552, + "grad_norm": 0.45532315969467163, + "learning_rate": 3.645484949832776e-06, + "loss": 3.217, + "step": 38200 + }, + { + "epoch": 540.0190543401552, + "eval_loss": 3.159839391708374, + "eval_runtime": 75.6969, + "eval_samples_per_second": 132.106, + "eval_steps_per_second": 8.257, + "step": 38200 + }, + { + "epoch": 541.4304869442484, + "grad_norm": 0.4042709469795227, + "learning_rate": 3.6287625418060203e-06, + "loss": 3.2167, + "step": 38300 + }, + { + "epoch": 541.4304869442484, + "eval_loss": 3.1588003635406494, + "eval_runtime": 75.6208, + "eval_samples_per_second": 132.239, + "eval_steps_per_second": 8.265, + "step": 38300 + }, + { + "epoch": 542.8419195483416, + "grad_norm": 0.4348280727863312, + "learning_rate": 3.6120401337792645e-06, + "loss": 3.2158, + "step": 38400 + }, + { + "epoch": 542.8419195483416, + "eval_loss": 3.156273365020752, + "eval_runtime": 75.5256, + "eval_samples_per_second": 132.405, + "eval_steps_per_second": 8.275, + "step": 38400 + }, + { + "epoch": 544.2533521524347, + "grad_norm": 0.49892503023147583, + "learning_rate": 3.5953177257525087e-06, + "loss": 3.2145, + "step": 38500 + }, + { + "epoch": 544.2533521524347, + "eval_loss": 3.1565325260162354, + "eval_runtime": 75.5712, + "eval_samples_per_second": 132.325, + "eval_steps_per_second": 8.27, + "step": 38500 + }, + { + "epoch": 545.6647847565279, + "grad_norm": 0.522286057472229, + "learning_rate": 3.578595317725753e-06, + "loss": 3.214, + "step": 38600 + }, + { + "epoch": 545.6647847565279, + "eval_loss": 3.156520128250122, + "eval_runtime": 75.3314, + "eval_samples_per_second": 132.747, + "eval_steps_per_second": 8.297, + "step": 38600 + }, + { + "epoch": 547.0762173606211, + "grad_norm": 0.4889813959598541, + "learning_rate": 3.5618729096989966e-06, + "loss": 3.2125, + "step": 38700 + }, + { + "epoch": 547.0762173606211, + "eval_loss": 3.156526565551758, + "eval_runtime": 75.5675, + "eval_samples_per_second": 132.332, + "eval_steps_per_second": 8.271, + "step": 38700 + }, + { + "epoch": 548.4876499647141, + "grad_norm": 0.43349429965019226, + "learning_rate": 3.5451505016722408e-06, + "loss": 3.2127, + "step": 38800 + }, + { + "epoch": 548.4876499647141, + "eval_loss": 3.155153274536133, + "eval_runtime": 75.5428, + "eval_samples_per_second": 132.375, + "eval_steps_per_second": 8.273, + "step": 38800 + }, + { + "epoch": 549.8990825688073, + "grad_norm": 0.4011054039001465, + "learning_rate": 3.528428093645485e-06, + "loss": 3.2116, + "step": 38900 + }, + { + "epoch": 549.8990825688073, + "eval_loss": 3.152825355529785, + "eval_runtime": 75.5836, + "eval_samples_per_second": 132.304, + "eval_steps_per_second": 8.269, + "step": 38900 + }, + { + "epoch": 551.3105151729005, + "grad_norm": 0.4457249939441681, + "learning_rate": 3.511705685618729e-06, + "loss": 3.2117, + "step": 39000 + }, + { + "epoch": 551.3105151729005, + "eval_loss": 3.151512384414673, + "eval_runtime": 75.5827, + "eval_samples_per_second": 132.305, + "eval_steps_per_second": 8.269, + "step": 39000 + }, + { + "epoch": 552.7219477769936, + "grad_norm": 0.42113420367240906, + "learning_rate": 3.4949832775919733e-06, + "loss": 3.2107, + "step": 39100 + }, + { + "epoch": 552.7219477769936, + "eval_loss": 3.1546247005462646, + "eval_runtime": 75.4062, + "eval_samples_per_second": 132.615, + "eval_steps_per_second": 8.288, + "step": 39100 + }, + { + "epoch": 554.1333803810868, + "grad_norm": 0.3948840796947479, + "learning_rate": 3.4782608695652175e-06, + "loss": 3.2097, + "step": 39200 + }, + { + "epoch": 554.1333803810868, + "eval_loss": 3.1522581577301025, + "eval_runtime": 75.6127, + "eval_samples_per_second": 132.253, + "eval_steps_per_second": 8.266, + "step": 39200 + }, + { + "epoch": 555.54481298518, + "grad_norm": 0.4366632103919983, + "learning_rate": 3.4615384615384617e-06, + "loss": 3.2087, + "step": 39300 + }, + { + "epoch": 555.54481298518, + "eval_loss": 3.1504454612731934, + "eval_runtime": 75.6076, + "eval_samples_per_second": 132.262, + "eval_steps_per_second": 8.266, + "step": 39300 + }, + { + "epoch": 556.9562455892731, + "grad_norm": 0.48205867409706116, + "learning_rate": 3.444816053511706e-06, + "loss": 3.2093, + "step": 39400 + }, + { + "epoch": 556.9562455892731, + "eval_loss": 3.1492788791656494, + "eval_runtime": 75.6481, + "eval_samples_per_second": 132.191, + "eval_steps_per_second": 8.262, + "step": 39400 + }, + { + "epoch": 558.3676781933663, + "grad_norm": 0.37896397709846497, + "learning_rate": 3.42809364548495e-06, + "loss": 3.2076, + "step": 39500 + }, + { + "epoch": 558.3676781933663, + "eval_loss": 3.1518449783325195, + "eval_runtime": 75.7, + "eval_samples_per_second": 132.1, + "eval_steps_per_second": 8.256, + "step": 39500 + }, + { + "epoch": 559.7791107974595, + "grad_norm": 0.5973118543624878, + "learning_rate": 3.4113712374581942e-06, + "loss": 3.2063, + "step": 39600 + }, + { + "epoch": 559.7791107974595, + "eval_loss": 3.149134397506714, + "eval_runtime": 75.4698, + "eval_samples_per_second": 132.503, + "eval_steps_per_second": 8.281, + "step": 39600 + }, + { + "epoch": 561.1905434015525, + "grad_norm": 0.45621445775032043, + "learning_rate": 3.3946488294314384e-06, + "loss": 3.2068, + "step": 39700 + }, + { + "epoch": 561.1905434015525, + "eval_loss": 3.1491434574127197, + "eval_runtime": 75.6152, + "eval_samples_per_second": 132.248, + "eval_steps_per_second": 8.266, + "step": 39700 + }, + { + "epoch": 562.6019760056457, + "grad_norm": 0.43519681692123413, + "learning_rate": 3.3779264214046826e-06, + "loss": 3.2064, + "step": 39800 + }, + { + "epoch": 562.6019760056457, + "eval_loss": 3.150059938430786, + "eval_runtime": 75.5035, + "eval_samples_per_second": 132.444, + "eval_steps_per_second": 8.278, + "step": 39800 + }, + { + "epoch": 564.0134086097389, + "grad_norm": 0.4100271165370941, + "learning_rate": 3.3612040133779268e-06, + "loss": 3.2052, + "step": 39900 + }, + { + "epoch": 564.0134086097389, + "eval_loss": 3.146653175354004, + "eval_runtime": 75.6217, + "eval_samples_per_second": 132.237, + "eval_steps_per_second": 8.265, + "step": 39900 + }, + { + "epoch": 565.424841213832, + "grad_norm": 0.5362276434898376, + "learning_rate": 3.344481605351171e-06, + "loss": 3.2053, + "step": 40000 + }, + { + "epoch": 565.424841213832, + "eval_loss": 3.1463942527770996, + "eval_runtime": 75.2649, + "eval_samples_per_second": 132.864, + "eval_steps_per_second": 8.304, + "step": 40000 + }, + { + "epoch": 566.8362738179252, + "grad_norm": 0.49909672141075134, + "learning_rate": 3.327759197324415e-06, + "loss": 3.2037, + "step": 40100 + }, + { + "epoch": 566.8362738179252, + "eval_loss": 3.146095037460327, + "eval_runtime": 76.2008, + "eval_samples_per_second": 131.232, + "eval_steps_per_second": 8.202, + "step": 40100 + }, + { + "epoch": 568.2477064220184, + "grad_norm": 0.4283011257648468, + "learning_rate": 3.3110367892976593e-06, + "loss": 3.203, + "step": 40200 + }, + { + "epoch": 568.2477064220184, + "eval_loss": 3.1441144943237305, + "eval_runtime": 75.2084, + "eval_samples_per_second": 132.964, + "eval_steps_per_second": 8.31, + "step": 40200 + }, + { + "epoch": 569.6591390261115, + "grad_norm": 0.48537662625312805, + "learning_rate": 3.2943143812709035e-06, + "loss": 3.202, + "step": 40300 + }, + { + "epoch": 569.6591390261115, + "eval_loss": 3.146061658859253, + "eval_runtime": 75.1567, + "eval_samples_per_second": 133.055, + "eval_steps_per_second": 8.316, + "step": 40300 + }, + { + "epoch": 571.0705716302047, + "grad_norm": 0.5248004794120789, + "learning_rate": 3.2775919732441473e-06, + "loss": 3.2024, + "step": 40400 + }, + { + "epoch": 571.0705716302047, + "eval_loss": 3.1482038497924805, + "eval_runtime": 75.2627, + "eval_samples_per_second": 132.868, + "eval_steps_per_second": 8.304, + "step": 40400 + }, + { + "epoch": 572.4820042342978, + "grad_norm": 0.406740665435791, + "learning_rate": 3.2608695652173914e-06, + "loss": 3.2005, + "step": 40500 + }, + { + "epoch": 572.4820042342978, + "eval_loss": 3.1435883045196533, + "eval_runtime": 75.3704, + "eval_samples_per_second": 132.678, + "eval_steps_per_second": 8.292, + "step": 40500 + }, + { + "epoch": 573.8934368383909, + "grad_norm": 0.4268272817134857, + "learning_rate": 3.2441471571906356e-06, + "loss": 3.2016, + "step": 40600 + }, + { + "epoch": 573.8934368383909, + "eval_loss": 3.1432077884674072, + "eval_runtime": 75.5314, + "eval_samples_per_second": 132.395, + "eval_steps_per_second": 8.275, + "step": 40600 + }, + { + "epoch": 575.3048694424841, + "grad_norm": 0.3838156461715698, + "learning_rate": 3.22742474916388e-06, + "loss": 3.1999, + "step": 40700 + }, + { + "epoch": 575.3048694424841, + "eval_loss": 3.1417696475982666, + "eval_runtime": 75.5553, + "eval_samples_per_second": 132.353, + "eval_steps_per_second": 8.272, + "step": 40700 + }, + { + "epoch": 576.7163020465773, + "grad_norm": 0.3539990484714508, + "learning_rate": 3.210702341137124e-06, + "loss": 3.1997, + "step": 40800 + }, + { + "epoch": 576.7163020465773, + "eval_loss": 3.141932964324951, + "eval_runtime": 75.6878, + "eval_samples_per_second": 132.122, + "eval_steps_per_second": 8.258, + "step": 40800 + }, + { + "epoch": 578.1277346506704, + "grad_norm": 0.45554637908935547, + "learning_rate": 3.193979933110368e-06, + "loss": 3.1994, + "step": 40900 + }, + { + "epoch": 578.1277346506704, + "eval_loss": 3.142242670059204, + "eval_runtime": 75.5684, + "eval_samples_per_second": 132.33, + "eval_steps_per_second": 8.271, + "step": 40900 + }, + { + "epoch": 579.5391672547636, + "grad_norm": 0.44678226113319397, + "learning_rate": 3.1772575250836123e-06, + "loss": 3.1994, + "step": 41000 + }, + { + "epoch": 579.5391672547636, + "eval_loss": 3.141388177871704, + "eval_runtime": 75.4266, + "eval_samples_per_second": 132.579, + "eval_steps_per_second": 8.286, + "step": 41000 + }, + { + "epoch": 580.9505998588568, + "grad_norm": 0.4156506061553955, + "learning_rate": 3.1605351170568565e-06, + "loss": 3.1978, + "step": 41100 + }, + { + "epoch": 580.9505998588568, + "eval_loss": 3.138974905014038, + "eval_runtime": 75.6365, + "eval_samples_per_second": 132.211, + "eval_steps_per_second": 8.263, + "step": 41100 + }, + { + "epoch": 582.3620324629499, + "grad_norm": 0.42633840441703796, + "learning_rate": 3.1438127090301007e-06, + "loss": 3.1976, + "step": 41200 + }, + { + "epoch": 582.3620324629499, + "eval_loss": 3.1378233432769775, + "eval_runtime": 75.5912, + "eval_samples_per_second": 132.291, + "eval_steps_per_second": 8.268, + "step": 41200 + }, + { + "epoch": 583.773465067043, + "grad_norm": 0.3734727203845978, + "learning_rate": 3.127090301003345e-06, + "loss": 3.1982, + "step": 41300 + }, + { + "epoch": 583.773465067043, + "eval_loss": 3.1387135982513428, + "eval_runtime": 75.5016, + "eval_samples_per_second": 132.448, + "eval_steps_per_second": 8.278, + "step": 41300 + }, + { + "epoch": 585.1848976711362, + "grad_norm": 0.46358761191368103, + "learning_rate": 3.110367892976589e-06, + "loss": 3.1963, + "step": 41400 + }, + { + "epoch": 585.1848976711362, + "eval_loss": 3.138916492462158, + "eval_runtime": 75.6514, + "eval_samples_per_second": 132.185, + "eval_steps_per_second": 8.262, + "step": 41400 + }, + { + "epoch": 586.5963302752293, + "grad_norm": 0.48239365220069885, + "learning_rate": 3.0936454849498333e-06, + "loss": 3.1956, + "step": 41500 + }, + { + "epoch": 586.5963302752293, + "eval_loss": 3.1406919956207275, + "eval_runtime": 75.3804, + "eval_samples_per_second": 132.66, + "eval_steps_per_second": 8.291, + "step": 41500 + }, + { + "epoch": 588.0077628793225, + "grad_norm": 0.3974038362503052, + "learning_rate": 3.0769230769230774e-06, + "loss": 3.1953, + "step": 41600 + }, + { + "epoch": 588.0077628793225, + "eval_loss": 3.138157844543457, + "eval_runtime": 75.5546, + "eval_samples_per_second": 132.355, + "eval_steps_per_second": 8.272, + "step": 41600 + }, + { + "epoch": 589.4191954834157, + "grad_norm": 0.379353404045105, + "learning_rate": 3.0602006688963216e-06, + "loss": 3.195, + "step": 41700 + }, + { + "epoch": 589.4191954834157, + "eval_loss": 3.137274980545044, + "eval_runtime": 75.5651, + "eval_samples_per_second": 132.336, + "eval_steps_per_second": 8.271, + "step": 41700 + }, + { + "epoch": 590.8306280875088, + "grad_norm": 0.40416419506073, + "learning_rate": 3.043478260869566e-06, + "loss": 3.1938, + "step": 41800 + }, + { + "epoch": 590.8306280875088, + "eval_loss": 3.134814739227295, + "eval_runtime": 75.6798, + "eval_samples_per_second": 132.136, + "eval_steps_per_second": 8.258, + "step": 41800 + }, + { + "epoch": 592.242060691602, + "grad_norm": 0.49428197741508484, + "learning_rate": 3.02675585284281e-06, + "loss": 3.1933, + "step": 41900 + }, + { + "epoch": 592.242060691602, + "eval_loss": 3.13729190826416, + "eval_runtime": 75.4545, + "eval_samples_per_second": 132.53, + "eval_steps_per_second": 8.283, + "step": 41900 + }, + { + "epoch": 593.6534932956952, + "grad_norm": 0.39984115958213806, + "learning_rate": 3.010033444816054e-06, + "loss": 3.1939, + "step": 42000 + }, + { + "epoch": 593.6534932956952, + "eval_loss": 3.1380887031555176, + "eval_runtime": 74.1402, + "eval_samples_per_second": 134.88, + "eval_steps_per_second": 8.43, + "step": 42000 + }, + { + "epoch": 595.0649258997882, + "grad_norm": 0.42510271072387695, + "learning_rate": 2.9933110367892983e-06, + "loss": 3.1925, + "step": 42100 + }, + { + "epoch": 595.0649258997882, + "eval_loss": 3.1366851329803467, + "eval_runtime": 74.1063, + "eval_samples_per_second": 134.941, + "eval_steps_per_second": 8.434, + "step": 42100 + }, + { + "epoch": 596.4763585038814, + "grad_norm": 0.39396584033966064, + "learning_rate": 2.976588628762542e-06, + "loss": 3.192, + "step": 42200 + }, + { + "epoch": 596.4763585038814, + "eval_loss": 3.1356325149536133, + "eval_runtime": 74.0085, + "eval_samples_per_second": 135.12, + "eval_steps_per_second": 8.445, + "step": 42200 + }, + { + "epoch": 597.8877911079746, + "grad_norm": 0.38813215494155884, + "learning_rate": 2.9598662207357863e-06, + "loss": 3.1908, + "step": 42300 + }, + { + "epoch": 597.8877911079746, + "eval_loss": 3.1375906467437744, + "eval_runtime": 75.3924, + "eval_samples_per_second": 132.639, + "eval_steps_per_second": 8.29, + "step": 42300 + }, + { + "epoch": 599.2992237120677, + "grad_norm": 0.39870092272758484, + "learning_rate": 2.9431438127090305e-06, + "loss": 3.1907, + "step": 42400 + }, + { + "epoch": 599.2992237120677, + "eval_loss": 3.1329517364501953, + "eval_runtime": 75.5449, + "eval_samples_per_second": 132.372, + "eval_steps_per_second": 8.273, + "step": 42400 + }, + { + "epoch": 600.7106563161609, + "grad_norm": 0.40714314579963684, + "learning_rate": 2.9264214046822746e-06, + "loss": 3.1908, + "step": 42500 + }, + { + "epoch": 600.7106563161609, + "eval_loss": 3.130784034729004, + "eval_runtime": 75.095, + "eval_samples_per_second": 133.165, + "eval_steps_per_second": 8.323, + "step": 42500 + }, + { + "epoch": 602.1220889202541, + "grad_norm": 0.4590309262275696, + "learning_rate": 2.9096989966555184e-06, + "loss": 3.1906, + "step": 42600 + }, + { + "epoch": 602.1220889202541, + "eval_loss": 3.130798816680908, + "eval_runtime": 75.2081, + "eval_samples_per_second": 132.964, + "eval_steps_per_second": 8.31, + "step": 42600 + }, + { + "epoch": 603.5335215243472, + "grad_norm": 0.4752305746078491, + "learning_rate": 2.8929765886287626e-06, + "loss": 3.1885, + "step": 42700 + }, + { + "epoch": 603.5335215243472, + "eval_loss": 3.1303343772888184, + "eval_runtime": 75.2947, + "eval_samples_per_second": 132.812, + "eval_steps_per_second": 8.301, + "step": 42700 + }, + { + "epoch": 604.9449541284404, + "grad_norm": 0.5041990876197815, + "learning_rate": 2.8762541806020068e-06, + "loss": 3.1889, + "step": 42800 + }, + { + "epoch": 604.9449541284404, + "eval_loss": 3.133119821548462, + "eval_runtime": 75.4469, + "eval_samples_per_second": 132.544, + "eval_steps_per_second": 8.284, + "step": 42800 + }, + { + "epoch": 606.3563867325336, + "grad_norm": 0.4223128855228424, + "learning_rate": 2.859531772575251e-06, + "loss": 3.1883, + "step": 42900 + }, + { + "epoch": 606.3563867325336, + "eval_loss": 3.13100266456604, + "eval_runtime": 75.1208, + "eval_samples_per_second": 133.119, + "eval_steps_per_second": 8.32, + "step": 42900 + }, + { + "epoch": 607.7678193366266, + "grad_norm": 0.4178304672241211, + "learning_rate": 2.842809364548495e-06, + "loss": 3.1884, + "step": 43000 + }, + { + "epoch": 607.7678193366266, + "eval_loss": 3.1309714317321777, + "eval_runtime": 75.1871, + "eval_samples_per_second": 133.001, + "eval_steps_per_second": 8.313, + "step": 43000 + }, + { + "epoch": 609.1792519407198, + "grad_norm": 0.38649800419807434, + "learning_rate": 2.8260869565217393e-06, + "loss": 3.187, + "step": 43100 + }, + { + "epoch": 609.1792519407198, + "eval_loss": 3.1306862831115723, + "eval_runtime": 75.2092, + "eval_samples_per_second": 132.962, + "eval_steps_per_second": 8.31, + "step": 43100 + }, + { + "epoch": 610.590684544813, + "grad_norm": 0.4299427270889282, + "learning_rate": 2.8093645484949835e-06, + "loss": 3.1873, + "step": 43200 + }, + { + "epoch": 610.590684544813, + "eval_loss": 3.1312477588653564, + "eval_runtime": 75.2144, + "eval_samples_per_second": 132.953, + "eval_steps_per_second": 8.31, + "step": 43200 + }, + { + "epoch": 612.0021171489061, + "grad_norm": 0.46031683683395386, + "learning_rate": 2.7926421404682277e-06, + "loss": 3.1868, + "step": 43300 + }, + { + "epoch": 612.0021171489061, + "eval_loss": 3.1275081634521484, + "eval_runtime": 75.6145, + "eval_samples_per_second": 132.25, + "eval_steps_per_second": 8.266, + "step": 43300 + }, + { + "epoch": 613.4135497529993, + "grad_norm": 0.4347423017024994, + "learning_rate": 2.7759197324414714e-06, + "loss": 3.1856, + "step": 43400 + }, + { + "epoch": 613.4135497529993, + "eval_loss": 3.1303658485412598, + "eval_runtime": 75.6408, + "eval_samples_per_second": 132.204, + "eval_steps_per_second": 8.263, + "step": 43400 + }, + { + "epoch": 614.8249823570925, + "grad_norm": 0.38185596466064453, + "learning_rate": 2.7591973244147156e-06, + "loss": 3.1844, + "step": 43500 + }, + { + "epoch": 614.8249823570925, + "eval_loss": 3.1326870918273926, + "eval_runtime": 75.6056, + "eval_samples_per_second": 132.265, + "eval_steps_per_second": 8.267, + "step": 43500 + }, + { + "epoch": 616.2364149611856, + "grad_norm": 0.40985825657844543, + "learning_rate": 2.74247491638796e-06, + "loss": 3.1847, + "step": 43600 + }, + { + "epoch": 616.2364149611856, + "eval_loss": 3.128065347671509, + "eval_runtime": 75.6157, + "eval_samples_per_second": 132.248, + "eval_steps_per_second": 8.265, + "step": 43600 + }, + { + "epoch": 617.6478475652788, + "grad_norm": 0.43245822191238403, + "learning_rate": 2.725752508361204e-06, + "loss": 3.1839, + "step": 43700 + }, + { + "epoch": 617.6478475652788, + "eval_loss": 3.128894805908203, + "eval_runtime": 75.5829, + "eval_samples_per_second": 132.305, + "eval_steps_per_second": 8.269, + "step": 43700 + }, + { + "epoch": 619.059280169372, + "grad_norm": 0.3881818950176239, + "learning_rate": 2.709030100334448e-06, + "loss": 3.1827, + "step": 43800 + }, + { + "epoch": 619.059280169372, + "eval_loss": 3.1263980865478516, + "eval_runtime": 75.3715, + "eval_samples_per_second": 132.676, + "eval_steps_per_second": 8.292, + "step": 43800 + }, + { + "epoch": 620.470712773465, + "grad_norm": 0.36317145824432373, + "learning_rate": 2.6923076923076923e-06, + "loss": 3.1842, + "step": 43900 + }, + { + "epoch": 620.470712773465, + "eval_loss": 3.1291022300720215, + "eval_runtime": 75.6179, + "eval_samples_per_second": 132.244, + "eval_steps_per_second": 8.265, + "step": 43900 + }, + { + "epoch": 621.8821453775582, + "grad_norm": 0.3756316602230072, + "learning_rate": 2.6755852842809365e-06, + "loss": 3.1834, + "step": 44000 + }, + { + "epoch": 621.8821453775582, + "eval_loss": 3.1239471435546875, + "eval_runtime": 75.2359, + "eval_samples_per_second": 132.915, + "eval_steps_per_second": 8.307, + "step": 44000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, - "num_train_epochs": 426, + "num_train_epochs": 858, "save_steps": 1000, "stateful_callbacks": { "EarlyStoppingCallback": { @@ -3485,7 +6635,7 @@ "attributes": {} } }, - "total_flos": 2.7437151333147083e+19, + "total_flos": 9.901642959904604e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null