| { |
| "best_metric": 3.1239471435546875, |
| "best_model_checkpoint": "learning_source_20260316/rna_celltype/bert-output/rna_celltype-medium/checkpoint-44000", |
| "epoch": 621.8821453775582, |
| "eval_steps": 100, |
| "global_step": 44000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.7057785619761799, |
| "grad_norm": 4.538567066192627, |
| "learning_rate": 5e-06, |
| "loss": 5.8142, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7057785619761799, |
| "eval_loss": 5.327540397644043, |
| "eval_runtime": 193.0087, |
| "eval_samples_per_second": 51.811, |
| "eval_steps_per_second": 6.476, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.4115571239523599, |
| "grad_norm": 7.248689651489258, |
| "learning_rate": 1e-05, |
| "loss": 5.4369, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.4115571239523599, |
| "eval_loss": 5.1204304695129395, |
| "eval_runtime": 192.4871, |
| "eval_samples_per_second": 51.952, |
| "eval_steps_per_second": 6.494, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.11733568592854, |
| "grad_norm": 3.5560193061828613, |
| "learning_rate": 9.983277591973245e-06, |
| "loss": 5.2934, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.11733568592854, |
| "eval_loss": 5.017918109893799, |
| "eval_runtime": 191.809, |
| "eval_samples_per_second": 52.135, |
| "eval_steps_per_second": 6.517, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.8231142479047198, |
| "grad_norm": 3.971824884414673, |
| "learning_rate": 9.966555183946488e-06, |
| "loss": 5.1746, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.8231142479047198, |
| "eval_loss": 4.890848636627197, |
| "eval_runtime": 192.9174, |
| "eval_samples_per_second": 51.836, |
| "eval_steps_per_second": 6.479, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.5288928098809, |
| "grad_norm": 4.859294891357422, |
| "learning_rate": 9.949832775919734e-06, |
| "loss": 5.0841, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.5288928098809, |
| "eval_loss": 4.817997455596924, |
| "eval_runtime": 192.8217, |
| "eval_samples_per_second": 51.861, |
| "eval_steps_per_second": 6.483, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.23467137185708, |
| "grad_norm": 6.207881927490234, |
| "learning_rate": 9.933110367892978e-06, |
| "loss": 5.0152, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.23467137185708, |
| "eval_loss": 4.762999057769775, |
| "eval_runtime": 192.7086, |
| "eval_samples_per_second": 51.892, |
| "eval_steps_per_second": 6.486, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.94044993383326, |
| "grad_norm": 6.541872024536133, |
| "learning_rate": 9.916387959866221e-06, |
| "loss": 4.962, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.94044993383326, |
| "eval_loss": 4.727071762084961, |
| "eval_runtime": 192.7219, |
| "eval_samples_per_second": 51.888, |
| "eval_steps_per_second": 6.486, |
| "step": 700 |
| }, |
| { |
| "epoch": 5.6462284958094395, |
| "grad_norm": 5.75955057144165, |
| "learning_rate": 9.899665551839465e-06, |
| "loss": 4.9235, |
| "step": 800 |
| }, |
| { |
| "epoch": 5.6462284958094395, |
| "eval_loss": 4.683049201965332, |
| "eval_runtime": 192.7611, |
| "eval_samples_per_second": 51.878, |
| "eval_steps_per_second": 6.485, |
| "step": 800 |
| }, |
| { |
| "epoch": 6.352007057785619, |
| "grad_norm": 5.269717693328857, |
| "learning_rate": 9.88294314381271e-06, |
| "loss": 4.8862, |
| "step": 900 |
| }, |
| { |
| "epoch": 6.352007057785619, |
| "eval_loss": 4.64549446105957, |
| "eval_runtime": 196.0126, |
| "eval_samples_per_second": 51.017, |
| "eval_steps_per_second": 6.377, |
| "step": 900 |
| }, |
| { |
| "epoch": 7.0577856197618, |
| "grad_norm": 5.438960552215576, |
| "learning_rate": 9.866220735785954e-06, |
| "loss": 4.8578, |
| "step": 1000 |
| }, |
| { |
| "epoch": 7.0577856197618, |
| "eval_loss": 4.6224799156188965, |
| "eval_runtime": 192.8239, |
| "eval_samples_per_second": 51.861, |
| "eval_steps_per_second": 6.483, |
| "step": 1000 |
| }, |
| { |
| "epoch": 7.76356418173798, |
| "grad_norm": 6.405101776123047, |
| "learning_rate": 9.849498327759198e-06, |
| "loss": 4.8345, |
| "step": 1100 |
| }, |
| { |
| "epoch": 7.76356418173798, |
| "eval_loss": 4.619054317474365, |
| "eval_runtime": 192.9295, |
| "eval_samples_per_second": 51.832, |
| "eval_steps_per_second": 6.479, |
| "step": 1100 |
| }, |
| { |
| "epoch": 8.46934274371416, |
| "grad_norm": 5.179597854614258, |
| "learning_rate": 9.832775919732442e-06, |
| "loss": 4.8176, |
| "step": 1200 |
| }, |
| { |
| "epoch": 8.46934274371416, |
| "eval_loss": 4.602497100830078, |
| "eval_runtime": 193.0329, |
| "eval_samples_per_second": 51.805, |
| "eval_steps_per_second": 6.476, |
| "step": 1200 |
| }, |
| { |
| "epoch": 9.17512130569034, |
| "grad_norm": 3.9729690551757812, |
| "learning_rate": 9.816053511705687e-06, |
| "loss": 4.7947, |
| "step": 1300 |
| }, |
| { |
| "epoch": 9.17512130569034, |
| "eval_loss": 4.545534610748291, |
| "eval_runtime": 192.786, |
| "eval_samples_per_second": 51.871, |
| "eval_steps_per_second": 6.484, |
| "step": 1300 |
| }, |
| { |
| "epoch": 9.88089986766652, |
| "grad_norm": 4.882862567901611, |
| "learning_rate": 9.799331103678931e-06, |
| "loss": 4.7722, |
| "step": 1400 |
| }, |
| { |
| "epoch": 9.88089986766652, |
| "eval_loss": 4.546108722686768, |
| "eval_runtime": 192.8307, |
| "eval_samples_per_second": 51.859, |
| "eval_steps_per_second": 6.482, |
| "step": 1400 |
| }, |
| { |
| "epoch": 10.5866784296427, |
| "grad_norm": 3.8944156169891357, |
| "learning_rate": 9.782608695652175e-06, |
| "loss": 4.7513, |
| "step": 1500 |
| }, |
| { |
| "epoch": 10.5866784296427, |
| "eval_loss": 4.515088081359863, |
| "eval_runtime": 192.7368, |
| "eval_samples_per_second": 51.884, |
| "eval_steps_per_second": 6.486, |
| "step": 1500 |
| }, |
| { |
| "epoch": 11.292456991618879, |
| "grad_norm": 5.533792018890381, |
| "learning_rate": 9.765886287625419e-06, |
| "loss": 4.731, |
| "step": 1600 |
| }, |
| { |
| "epoch": 11.292456991618879, |
| "eval_loss": 4.497799396514893, |
| "eval_runtime": 191.5673, |
| "eval_samples_per_second": 52.201, |
| "eval_steps_per_second": 6.525, |
| "step": 1600 |
| }, |
| { |
| "epoch": 11.998235553595059, |
| "grad_norm": 4.2257256507873535, |
| "learning_rate": 9.749163879598664e-06, |
| "loss": 4.7193, |
| "step": 1700 |
| }, |
| { |
| "epoch": 11.998235553595059, |
| "eval_loss": 4.487800598144531, |
| "eval_runtime": 192.5021, |
| "eval_samples_per_second": 51.947, |
| "eval_steps_per_second": 6.493, |
| "step": 1700 |
| }, |
| { |
| "epoch": 12.704014115571239, |
| "grad_norm": 4.029520034790039, |
| "learning_rate": 9.732441471571908e-06, |
| "loss": 4.7014, |
| "step": 1800 |
| }, |
| { |
| "epoch": 12.704014115571239, |
| "eval_loss": 4.481775760650635, |
| "eval_runtime": 192.7297, |
| "eval_samples_per_second": 51.886, |
| "eval_steps_per_second": 6.486, |
| "step": 1800 |
| }, |
| { |
| "epoch": 13.40979267754742, |
| "grad_norm": 3.9024765491485596, |
| "learning_rate": 9.715719063545151e-06, |
| "loss": 4.6848, |
| "step": 1900 |
| }, |
| { |
| "epoch": 13.40979267754742, |
| "eval_loss": 4.4503583908081055, |
| "eval_runtime": 192.7847, |
| "eval_samples_per_second": 51.871, |
| "eval_steps_per_second": 6.484, |
| "step": 1900 |
| }, |
| { |
| "epoch": 14.1155712395236, |
| "grad_norm": 5.402480125427246, |
| "learning_rate": 9.698996655518395e-06, |
| "loss": 4.6727, |
| "step": 2000 |
| }, |
| { |
| "epoch": 14.1155712395236, |
| "eval_loss": 4.455318927764893, |
| "eval_runtime": 192.7638, |
| "eval_samples_per_second": 51.877, |
| "eval_steps_per_second": 6.485, |
| "step": 2000 |
| }, |
| { |
| "epoch": 14.82134980149978, |
| "grad_norm": 3.5884945392608643, |
| "learning_rate": 9.682274247491639e-06, |
| "loss": 4.6611, |
| "step": 2100 |
| }, |
| { |
| "epoch": 14.82134980149978, |
| "eval_loss": 4.425973415374756, |
| "eval_runtime": 192.8769, |
| "eval_samples_per_second": 51.847, |
| "eval_steps_per_second": 6.481, |
| "step": 2100 |
| }, |
| { |
| "epoch": 15.52712836347596, |
| "grad_norm": 4.391833782196045, |
| "learning_rate": 9.665551839464884e-06, |
| "loss": 4.6476, |
| "step": 2200 |
| }, |
| { |
| "epoch": 15.52712836347596, |
| "eval_loss": 4.420197010040283, |
| "eval_runtime": 193.3815, |
| "eval_samples_per_second": 51.711, |
| "eval_steps_per_second": 6.464, |
| "step": 2200 |
| }, |
| { |
| "epoch": 16.232906925452138, |
| "grad_norm": 3.3455958366394043, |
| "learning_rate": 9.648829431438128e-06, |
| "loss": 4.638, |
| "step": 2300 |
| }, |
| { |
| "epoch": 16.232906925452138, |
| "eval_loss": 4.404079914093018, |
| "eval_runtime": 192.7422, |
| "eval_samples_per_second": 51.883, |
| "eval_steps_per_second": 6.485, |
| "step": 2300 |
| }, |
| { |
| "epoch": 16.93868548742832, |
| "grad_norm": 2.938906669616699, |
| "learning_rate": 9.632107023411372e-06, |
| "loss": 4.6264, |
| "step": 2400 |
| }, |
| { |
| "epoch": 16.93868548742832, |
| "eval_loss": 4.401567459106445, |
| "eval_runtime": 192.7203, |
| "eval_samples_per_second": 51.889, |
| "eval_steps_per_second": 6.486, |
| "step": 2400 |
| }, |
| { |
| "epoch": 17.644464049404498, |
| "grad_norm": 2.6890764236450195, |
| "learning_rate": 9.615384615384616e-06, |
| "loss": 4.6163, |
| "step": 2500 |
| }, |
| { |
| "epoch": 17.644464049404498, |
| "eval_loss": 4.387485504150391, |
| "eval_runtime": 192.7331, |
| "eval_samples_per_second": 51.885, |
| "eval_steps_per_second": 6.486, |
| "step": 2500 |
| }, |
| { |
| "epoch": 18.35024261138068, |
| "grad_norm": 3.5793533325195312, |
| "learning_rate": 9.598662207357861e-06, |
| "loss": 4.6039, |
| "step": 2600 |
| }, |
| { |
| "epoch": 18.35024261138068, |
| "eval_loss": 4.388312339782715, |
| "eval_runtime": 192.6508, |
| "eval_samples_per_second": 51.907, |
| "eval_steps_per_second": 6.488, |
| "step": 2600 |
| }, |
| { |
| "epoch": 19.05602117335686, |
| "grad_norm": 2.4879586696624756, |
| "learning_rate": 9.581939799331105e-06, |
| "loss": 4.5951, |
| "step": 2700 |
| }, |
| { |
| "epoch": 19.05602117335686, |
| "eval_loss": 4.378627300262451, |
| "eval_runtime": 192.7297, |
| "eval_samples_per_second": 51.886, |
| "eval_steps_per_second": 6.486, |
| "step": 2700 |
| }, |
| { |
| "epoch": 19.76179973533304, |
| "grad_norm": 4.8779072761535645, |
| "learning_rate": 9.565217391304349e-06, |
| "loss": 4.592, |
| "step": 2800 |
| }, |
| { |
| "epoch": 19.76179973533304, |
| "eval_loss": 4.376707077026367, |
| "eval_runtime": 191.5339, |
| "eval_samples_per_second": 52.21, |
| "eval_steps_per_second": 6.526, |
| "step": 2800 |
| }, |
| { |
| "epoch": 20.46757829730922, |
| "grad_norm": 4.811100006103516, |
| "learning_rate": 9.548494983277592e-06, |
| "loss": 4.5825, |
| "step": 2900 |
| }, |
| { |
| "epoch": 20.46757829730922, |
| "eval_loss": 4.363040447235107, |
| "eval_runtime": 192.7097, |
| "eval_samples_per_second": 51.892, |
| "eval_steps_per_second": 6.486, |
| "step": 2900 |
| }, |
| { |
| "epoch": 21.1733568592854, |
| "grad_norm": 3.4796102046966553, |
| "learning_rate": 9.531772575250838e-06, |
| "loss": 4.5731, |
| "step": 3000 |
| }, |
| { |
| "epoch": 21.1733568592854, |
| "eval_loss": 4.348217964172363, |
| "eval_runtime": 192.7117, |
| "eval_samples_per_second": 51.891, |
| "eval_steps_per_second": 6.486, |
| "step": 3000 |
| }, |
| { |
| "epoch": 21.87913542126158, |
| "grad_norm": 3.2958459854125977, |
| "learning_rate": 9.515050167224082e-06, |
| "loss": 4.5611, |
| "step": 3100 |
| }, |
| { |
| "epoch": 21.87913542126158, |
| "eval_loss": 4.336493968963623, |
| "eval_runtime": 192.6996, |
| "eval_samples_per_second": 51.894, |
| "eval_steps_per_second": 6.487, |
| "step": 3100 |
| }, |
| { |
| "epoch": 22.584913983237758, |
| "grad_norm": 3.4779412746429443, |
| "learning_rate": 9.498327759197325e-06, |
| "loss": 4.5518, |
| "step": 3200 |
| }, |
| { |
| "epoch": 22.584913983237758, |
| "eval_loss": 4.320661544799805, |
| "eval_runtime": 192.7594, |
| "eval_samples_per_second": 51.878, |
| "eval_steps_per_second": 6.485, |
| "step": 3200 |
| }, |
| { |
| "epoch": 23.29069254521394, |
| "grad_norm": 2.7907259464263916, |
| "learning_rate": 9.48160535117057e-06, |
| "loss": 4.5401, |
| "step": 3300 |
| }, |
| { |
| "epoch": 23.29069254521394, |
| "eval_loss": 4.316312313079834, |
| "eval_runtime": 192.9908, |
| "eval_samples_per_second": 51.816, |
| "eval_steps_per_second": 6.477, |
| "step": 3300 |
| }, |
| { |
| "epoch": 23.996471107190118, |
| "grad_norm": 3.1774535179138184, |
| "learning_rate": 9.464882943143815e-06, |
| "loss": 4.5307, |
| "step": 3400 |
| }, |
| { |
| "epoch": 23.996471107190118, |
| "eval_loss": 4.305875778198242, |
| "eval_runtime": 192.9792, |
| "eval_samples_per_second": 51.819, |
| "eval_steps_per_second": 6.477, |
| "step": 3400 |
| }, |
| { |
| "epoch": 24.7022496691663, |
| "grad_norm": 4.013304233551025, |
| "learning_rate": 9.448160535117058e-06, |
| "loss": 4.5236, |
| "step": 3500 |
| }, |
| { |
| "epoch": 24.7022496691663, |
| "eval_loss": 4.303380012512207, |
| "eval_runtime": 192.8249, |
| "eval_samples_per_second": 51.861, |
| "eval_steps_per_second": 6.483, |
| "step": 3500 |
| }, |
| { |
| "epoch": 25.408028231142477, |
| "grad_norm": 3.611783504486084, |
| "learning_rate": 9.431438127090302e-06, |
| "loss": 4.5173, |
| "step": 3600 |
| }, |
| { |
| "epoch": 25.408028231142477, |
| "eval_loss": 4.297857761383057, |
| "eval_runtime": 192.8889, |
| "eval_samples_per_second": 51.843, |
| "eval_steps_per_second": 6.48, |
| "step": 3600 |
| }, |
| { |
| "epoch": 26.11380679311866, |
| "grad_norm": 2.8548221588134766, |
| "learning_rate": 9.414715719063546e-06, |
| "loss": 4.5052, |
| "step": 3700 |
| }, |
| { |
| "epoch": 26.11380679311866, |
| "eval_loss": 4.289151668548584, |
| "eval_runtime": 192.7224, |
| "eval_samples_per_second": 51.888, |
| "eval_steps_per_second": 6.486, |
| "step": 3700 |
| }, |
| { |
| "epoch": 26.81958535509484, |
| "grad_norm": 3.3853394985198975, |
| "learning_rate": 9.39799331103679e-06, |
| "loss": 4.4983, |
| "step": 3800 |
| }, |
| { |
| "epoch": 26.81958535509484, |
| "eval_loss": 4.275169372558594, |
| "eval_runtime": 192.7669, |
| "eval_samples_per_second": 51.876, |
| "eval_steps_per_second": 6.485, |
| "step": 3800 |
| }, |
| { |
| "epoch": 27.52536391707102, |
| "grad_norm": 2.4811007976531982, |
| "learning_rate": 9.381270903010035e-06, |
| "loss": 4.4876, |
| "step": 3900 |
| }, |
| { |
| "epoch": 27.52536391707102, |
| "eval_loss": 4.272207736968994, |
| "eval_runtime": 192.9342, |
| "eval_samples_per_second": 51.831, |
| "eval_steps_per_second": 6.479, |
| "step": 3900 |
| }, |
| { |
| "epoch": 28.2311424790472, |
| "grad_norm": 2.3202896118164062, |
| "learning_rate": 9.364548494983279e-06, |
| "loss": 4.4792, |
| "step": 4000 |
| }, |
| { |
| "epoch": 28.2311424790472, |
| "eval_loss": 4.268562316894531, |
| "eval_runtime": 191.834, |
| "eval_samples_per_second": 52.128, |
| "eval_steps_per_second": 6.516, |
| "step": 4000 |
| }, |
| { |
| "epoch": 28.93692104102338, |
| "grad_norm": 2.385047674179077, |
| "learning_rate": 9.347826086956523e-06, |
| "loss": 4.4711, |
| "step": 4100 |
| }, |
| { |
| "epoch": 28.93692104102338, |
| "eval_loss": 4.258542537689209, |
| "eval_runtime": 192.3883, |
| "eval_samples_per_second": 51.978, |
| "eval_steps_per_second": 6.497, |
| "step": 4100 |
| }, |
| { |
| "epoch": 29.64269960299956, |
| "grad_norm": 2.873243808746338, |
| "learning_rate": 9.331103678929766e-06, |
| "loss": 4.4627, |
| "step": 4200 |
| }, |
| { |
| "epoch": 29.64269960299956, |
| "eval_loss": 4.245582580566406, |
| "eval_runtime": 192.4906, |
| "eval_samples_per_second": 51.951, |
| "eval_steps_per_second": 6.494, |
| "step": 4200 |
| }, |
| { |
| "epoch": 30.348478164975738, |
| "grad_norm": 2.814659833908081, |
| "learning_rate": 9.314381270903012e-06, |
| "loss": 4.4557, |
| "step": 4300 |
| }, |
| { |
| "epoch": 30.348478164975738, |
| "eval_loss": 4.239570140838623, |
| "eval_runtime": 192.8746, |
| "eval_samples_per_second": 51.847, |
| "eval_steps_per_second": 6.481, |
| "step": 4300 |
| }, |
| { |
| "epoch": 31.05425672695192, |
| "grad_norm": 2.69787859916687, |
| "learning_rate": 9.297658862876256e-06, |
| "loss": 4.4454, |
| "step": 4400 |
| }, |
| { |
| "epoch": 31.05425672695192, |
| "eval_loss": 4.234774589538574, |
| "eval_runtime": 192.878, |
| "eval_samples_per_second": 51.846, |
| "eval_steps_per_second": 6.481, |
| "step": 4400 |
| }, |
| { |
| "epoch": 31.760035288928098, |
| "grad_norm": 2.860370397567749, |
| "learning_rate": 9.2809364548495e-06, |
| "loss": 4.4384, |
| "step": 4500 |
| }, |
| { |
| "epoch": 31.760035288928098, |
| "eval_loss": 4.227160930633545, |
| "eval_runtime": 193.0111, |
| "eval_samples_per_second": 51.811, |
| "eval_steps_per_second": 6.476, |
| "step": 4500 |
| }, |
| { |
| "epoch": 32.465813850904276, |
| "grad_norm": 2.4307503700256348, |
| "learning_rate": 9.264214046822743e-06, |
| "loss": 4.4306, |
| "step": 4600 |
| }, |
| { |
| "epoch": 32.465813850904276, |
| "eval_loss": 4.218355655670166, |
| "eval_runtime": 192.9008, |
| "eval_samples_per_second": 51.84, |
| "eval_steps_per_second": 6.48, |
| "step": 4600 |
| }, |
| { |
| "epoch": 33.17159241288046, |
| "grad_norm": 2.4772098064422607, |
| "learning_rate": 9.247491638795989e-06, |
| "loss": 4.4233, |
| "step": 4700 |
| }, |
| { |
| "epoch": 33.17159241288046, |
| "eval_loss": 4.208319664001465, |
| "eval_runtime": 191.9987, |
| "eval_samples_per_second": 52.084, |
| "eval_steps_per_second": 6.51, |
| "step": 4700 |
| }, |
| { |
| "epoch": 33.87737097485664, |
| "grad_norm": 3.371622085571289, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 4.4188, |
| "step": 4800 |
| }, |
| { |
| "epoch": 33.87737097485664, |
| "eval_loss": 4.222493648529053, |
| "eval_runtime": 192.3929, |
| "eval_samples_per_second": 51.977, |
| "eval_steps_per_second": 6.497, |
| "step": 4800 |
| }, |
| { |
| "epoch": 34.58314953683282, |
| "grad_norm": 3.5253772735595703, |
| "learning_rate": 9.214046822742476e-06, |
| "loss": 4.4141, |
| "step": 4900 |
| }, |
| { |
| "epoch": 34.58314953683282, |
| "eval_loss": 4.214654922485352, |
| "eval_runtime": 192.7046, |
| "eval_samples_per_second": 51.893, |
| "eval_steps_per_second": 6.487, |
| "step": 4900 |
| }, |
| { |
| "epoch": 35.288928098809, |
| "grad_norm": 3.155348300933838, |
| "learning_rate": 9.19732441471572e-06, |
| "loss": 4.4075, |
| "step": 5000 |
| }, |
| { |
| "epoch": 35.288928098809, |
| "eval_loss": 4.202410697937012, |
| "eval_runtime": 192.8791, |
| "eval_samples_per_second": 51.846, |
| "eval_steps_per_second": 6.481, |
| "step": 5000 |
| }, |
| { |
| "epoch": 72.8348623853211, |
| "grad_norm": 3.394275188446045, |
| "learning_rate": 9.180602006688965e-06, |
| "loss": 4.3981, |
| "step": 5100 |
| }, |
| { |
| "epoch": 72.8348623853211, |
| "eval_loss": 4.189794063568115, |
| "eval_runtime": 75.5217, |
| "eval_samples_per_second": 132.412, |
| "eval_steps_per_second": 8.276, |
| "step": 5100 |
| }, |
| { |
| "epoch": 74.24629498941425, |
| "grad_norm": 2.187847375869751, |
| "learning_rate": 9.163879598662207e-06, |
| "loss": 4.3878, |
| "step": 5200 |
| }, |
| { |
| "epoch": 74.24629498941425, |
| "eval_loss": 4.175257205963135, |
| "eval_runtime": 75.2341, |
| "eval_samples_per_second": 132.918, |
| "eval_steps_per_second": 8.307, |
| "step": 5200 |
| }, |
| { |
| "epoch": 75.65772759350742, |
| "grad_norm": 2.895294427871704, |
| "learning_rate": 9.147157190635451e-06, |
| "loss": 4.3818, |
| "step": 5300 |
| }, |
| { |
| "epoch": 75.65772759350742, |
| "eval_loss": 4.169253349304199, |
| "eval_runtime": 75.1154, |
| "eval_samples_per_second": 133.129, |
| "eval_steps_per_second": 8.321, |
| "step": 5300 |
| }, |
| { |
| "epoch": 77.06916019760057, |
| "grad_norm": 2.3158841133117676, |
| "learning_rate": 9.130434782608697e-06, |
| "loss": 4.3728, |
| "step": 5400 |
| }, |
| { |
| "epoch": 77.06916019760057, |
| "eval_loss": 4.166540622711182, |
| "eval_runtime": 75.1279, |
| "eval_samples_per_second": 133.106, |
| "eval_steps_per_second": 8.319, |
| "step": 5400 |
| }, |
| { |
| "epoch": 78.48059280169372, |
| "grad_norm": 2.402764081954956, |
| "learning_rate": 9.11371237458194e-06, |
| "loss": 4.3659, |
| "step": 5500 |
| }, |
| { |
| "epoch": 78.48059280169372, |
| "eval_loss": 4.161038875579834, |
| "eval_runtime": 74.9365, |
| "eval_samples_per_second": 133.446, |
| "eval_steps_per_second": 8.34, |
| "step": 5500 |
| }, |
| { |
| "epoch": 79.89202540578687, |
| "grad_norm": 2.1741960048675537, |
| "learning_rate": 9.096989966555184e-06, |
| "loss": 4.3566, |
| "step": 5600 |
| }, |
| { |
| "epoch": 79.89202540578687, |
| "eval_loss": 4.145488262176514, |
| "eval_runtime": 75.1211, |
| "eval_samples_per_second": 133.118, |
| "eval_steps_per_second": 8.32, |
| "step": 5600 |
| }, |
| { |
| "epoch": 81.30345800988003, |
| "grad_norm": 1.9842256307601929, |
| "learning_rate": 9.080267558528428e-06, |
| "loss": 4.3473, |
| "step": 5700 |
| }, |
| { |
| "epoch": 81.30345800988003, |
| "eval_loss": 4.139003753662109, |
| "eval_runtime": 75.1615, |
| "eval_samples_per_second": 133.047, |
| "eval_steps_per_second": 8.315, |
| "step": 5700 |
| }, |
| { |
| "epoch": 82.71489061397318, |
| "grad_norm": 2.0038816928863525, |
| "learning_rate": 9.063545150501673e-06, |
| "loss": 4.3389, |
| "step": 5800 |
| }, |
| { |
| "epoch": 82.71489061397318, |
| "eval_loss": 4.1378560066223145, |
| "eval_runtime": 75.1493, |
| "eval_samples_per_second": 133.068, |
| "eval_steps_per_second": 8.317, |
| "step": 5800 |
| }, |
| { |
| "epoch": 84.12632321806633, |
| "grad_norm": 2.4109838008880615, |
| "learning_rate": 9.046822742474917e-06, |
| "loss": 4.3293, |
| "step": 5900 |
| }, |
| { |
| "epoch": 84.12632321806633, |
| "eval_loss": 4.127132415771484, |
| "eval_runtime": 75.1873, |
| "eval_samples_per_second": 133.001, |
| "eval_steps_per_second": 8.313, |
| "step": 5900 |
| }, |
| { |
| "epoch": 85.5377558221595, |
| "grad_norm": 2.1353368759155273, |
| "learning_rate": 9.03010033444816e-06, |
| "loss": 4.3203, |
| "step": 6000 |
| }, |
| { |
| "epoch": 85.5377558221595, |
| "eval_loss": 4.116780757904053, |
| "eval_runtime": 74.9341, |
| "eval_samples_per_second": 133.451, |
| "eval_steps_per_second": 8.341, |
| "step": 6000 |
| }, |
| { |
| "epoch": 86.94918842625265, |
| "grad_norm": 2.372614860534668, |
| "learning_rate": 9.013377926421405e-06, |
| "loss": 4.3126, |
| "step": 6100 |
| }, |
| { |
| "epoch": 86.94918842625265, |
| "eval_loss": 4.109887599945068, |
| "eval_runtime": 75.1685, |
| "eval_samples_per_second": 133.034, |
| "eval_steps_per_second": 8.315, |
| "step": 6100 |
| }, |
| { |
| "epoch": 88.3606210303458, |
| "grad_norm": 1.8894693851470947, |
| "learning_rate": 8.996655518394648e-06, |
| "loss": 4.3013, |
| "step": 6200 |
| }, |
| { |
| "epoch": 88.3606210303458, |
| "eval_loss": 4.100041389465332, |
| "eval_runtime": 75.1752, |
| "eval_samples_per_second": 133.023, |
| "eval_steps_per_second": 8.314, |
| "step": 6200 |
| }, |
| { |
| "epoch": 89.77205363443896, |
| "grad_norm": 1.9410585165023804, |
| "learning_rate": 8.979933110367894e-06, |
| "loss": 4.2934, |
| "step": 6300 |
| }, |
| { |
| "epoch": 89.77205363443896, |
| "eval_loss": 4.090487480163574, |
| "eval_runtime": 75.1691, |
| "eval_samples_per_second": 133.033, |
| "eval_steps_per_second": 8.315, |
| "step": 6300 |
| }, |
| { |
| "epoch": 91.18348623853211, |
| "grad_norm": 3.4204158782958984, |
| "learning_rate": 8.963210702341138e-06, |
| "loss": 4.2875, |
| "step": 6400 |
| }, |
| { |
| "epoch": 91.18348623853211, |
| "eval_loss": 4.08727502822876, |
| "eval_runtime": 75.1834, |
| "eval_samples_per_second": 133.008, |
| "eval_steps_per_second": 8.313, |
| "step": 6400 |
| }, |
| { |
| "epoch": 92.59491884262526, |
| "grad_norm": 3.0449135303497314, |
| "learning_rate": 8.946488294314381e-06, |
| "loss": 4.2808, |
| "step": 6500 |
| }, |
| { |
| "epoch": 92.59491884262526, |
| "eval_loss": 4.0819501876831055, |
| "eval_runtime": 74.8907, |
| "eval_samples_per_second": 133.528, |
| "eval_steps_per_second": 8.345, |
| "step": 6500 |
| }, |
| { |
| "epoch": 94.00635144671843, |
| "grad_norm": 1.9112857580184937, |
| "learning_rate": 8.929765886287625e-06, |
| "loss": 4.2735, |
| "step": 6600 |
| }, |
| { |
| "epoch": 94.00635144671843, |
| "eval_loss": 4.071495056152344, |
| "eval_runtime": 75.2761, |
| "eval_samples_per_second": 132.844, |
| "eval_steps_per_second": 8.303, |
| "step": 6600 |
| }, |
| { |
| "epoch": 95.41778405081158, |
| "grad_norm": 1.7011171579360962, |
| "learning_rate": 8.91304347826087e-06, |
| "loss": 4.264, |
| "step": 6700 |
| }, |
| { |
| "epoch": 95.41778405081158, |
| "eval_loss": 4.066438674926758, |
| "eval_runtime": 75.215, |
| "eval_samples_per_second": 132.952, |
| "eval_steps_per_second": 8.31, |
| "step": 6700 |
| }, |
| { |
| "epoch": 96.82921665490473, |
| "grad_norm": 1.510539174079895, |
| "learning_rate": 8.896321070234114e-06, |
| "loss": 4.2556, |
| "step": 6800 |
| }, |
| { |
| "epoch": 96.82921665490473, |
| "eval_loss": 4.061617851257324, |
| "eval_runtime": 75.2534, |
| "eval_samples_per_second": 132.884, |
| "eval_steps_per_second": 8.305, |
| "step": 6800 |
| }, |
| { |
| "epoch": 98.24064925899788, |
| "grad_norm": 1.6027462482452393, |
| "learning_rate": 8.879598662207358e-06, |
| "loss": 4.2455, |
| "step": 6900 |
| }, |
| { |
| "epoch": 98.24064925899788, |
| "eval_loss": 4.0491743087768555, |
| "eval_runtime": 75.2588, |
| "eval_samples_per_second": 132.875, |
| "eval_steps_per_second": 8.305, |
| "step": 6900 |
| }, |
| { |
| "epoch": 99.65208186309104, |
| "grad_norm": 1.8287110328674316, |
| "learning_rate": 8.862876254180602e-06, |
| "loss": 4.2361, |
| "step": 7000 |
| }, |
| { |
| "epoch": 99.65208186309104, |
| "eval_loss": 4.048295974731445, |
| "eval_runtime": 74.9626, |
| "eval_samples_per_second": 133.4, |
| "eval_steps_per_second": 8.337, |
| "step": 7000 |
| }, |
| { |
| "epoch": 101.06351446718419, |
| "grad_norm": 1.6138700246810913, |
| "learning_rate": 8.846153846153847e-06, |
| "loss": 4.2272, |
| "step": 7100 |
| }, |
| { |
| "epoch": 101.06351446718419, |
| "eval_loss": 4.032159805297852, |
| "eval_runtime": 75.1461, |
| "eval_samples_per_second": 133.074, |
| "eval_steps_per_second": 8.317, |
| "step": 7100 |
| }, |
| { |
| "epoch": 102.47494707127734, |
| "grad_norm": 1.5357685089111328, |
| "learning_rate": 8.829431438127091e-06, |
| "loss": 4.2171, |
| "step": 7200 |
| }, |
| { |
| "epoch": 102.47494707127734, |
| "eval_loss": 4.029723167419434, |
| "eval_runtime": 75.1903, |
| "eval_samples_per_second": 132.996, |
| "eval_steps_per_second": 8.312, |
| "step": 7200 |
| }, |
| { |
| "epoch": 103.8863796753705, |
| "grad_norm": 2.423367977142334, |
| "learning_rate": 8.812709030100335e-06, |
| "loss": 4.2108, |
| "step": 7300 |
| }, |
| { |
| "epoch": 103.8863796753705, |
| "eval_loss": 4.024651527404785, |
| "eval_runtime": 75.1893, |
| "eval_samples_per_second": 132.998, |
| "eval_steps_per_second": 8.312, |
| "step": 7300 |
| }, |
| { |
| "epoch": 105.29781227946366, |
| "grad_norm": 1.7042992115020752, |
| "learning_rate": 8.795986622073578e-06, |
| "loss": 4.203, |
| "step": 7400 |
| }, |
| { |
| "epoch": 105.29781227946366, |
| "eval_loss": 4.0152106285095215, |
| "eval_runtime": 74.9058, |
| "eval_samples_per_second": 133.501, |
| "eval_steps_per_second": 8.344, |
| "step": 7400 |
| }, |
| { |
| "epoch": 106.7092448835568, |
| "grad_norm": 1.934262752532959, |
| "learning_rate": 8.779264214046824e-06, |
| "loss": 4.1943, |
| "step": 7500 |
| }, |
| { |
| "epoch": 106.7092448835568, |
| "eval_loss": 4.006458282470703, |
| "eval_runtime": 75.0162, |
| "eval_samples_per_second": 133.305, |
| "eval_steps_per_second": 8.332, |
| "step": 7500 |
| }, |
| { |
| "epoch": 108.12067748764997, |
| "grad_norm": 1.7501742839813232, |
| "learning_rate": 8.762541806020068e-06, |
| "loss": 4.1853, |
| "step": 7600 |
| }, |
| { |
| "epoch": 108.12067748764997, |
| "eval_loss": 4.001890182495117, |
| "eval_runtime": 75.2517, |
| "eval_samples_per_second": 132.887, |
| "eval_steps_per_second": 8.305, |
| "step": 7600 |
| }, |
| { |
| "epoch": 109.53211009174312, |
| "grad_norm": 1.874009609222412, |
| "learning_rate": 8.745819397993311e-06, |
| "loss": 4.1751, |
| "step": 7700 |
| }, |
| { |
| "epoch": 109.53211009174312, |
| "eval_loss": 3.9898202419281006, |
| "eval_runtime": 75.147, |
| "eval_samples_per_second": 133.072, |
| "eval_steps_per_second": 8.317, |
| "step": 7700 |
| }, |
| { |
| "epoch": 110.94354269583627, |
| "grad_norm": 1.5682804584503174, |
| "learning_rate": 8.729096989966555e-06, |
| "loss": 4.1656, |
| "step": 7800 |
| }, |
| { |
| "epoch": 110.94354269583627, |
| "eval_loss": 3.9801883697509766, |
| "eval_runtime": 75.3127, |
| "eval_samples_per_second": 132.78, |
| "eval_steps_per_second": 8.299, |
| "step": 7800 |
| }, |
| { |
| "epoch": 112.35497529992942, |
| "grad_norm": 1.5020002126693726, |
| "learning_rate": 8.712374581939799e-06, |
| "loss": 4.1549, |
| "step": 7900 |
| }, |
| { |
| "epoch": 112.35497529992942, |
| "eval_loss": 3.973633289337158, |
| "eval_runtime": 75.0416, |
| "eval_samples_per_second": 133.259, |
| "eval_steps_per_second": 8.329, |
| "step": 7900 |
| }, |
| { |
| "epoch": 113.76640790402259, |
| "grad_norm": 1.4100362062454224, |
| "learning_rate": 8.695652173913044e-06, |
| "loss": 4.1444, |
| "step": 8000 |
| }, |
| { |
| "epoch": 113.76640790402259, |
| "eval_loss": 3.9643702507019043, |
| "eval_runtime": 75.2941, |
| "eval_samples_per_second": 132.813, |
| "eval_steps_per_second": 8.301, |
| "step": 8000 |
| }, |
| { |
| "epoch": 115.17784050811574, |
| "grad_norm": 1.2643848657608032, |
| "learning_rate": 8.678929765886288e-06, |
| "loss": 4.1331, |
| "step": 8100 |
| }, |
| { |
| "epoch": 115.17784050811574, |
| "eval_loss": 3.9529407024383545, |
| "eval_runtime": 75.2161, |
| "eval_samples_per_second": 132.95, |
| "eval_steps_per_second": 8.309, |
| "step": 8100 |
| }, |
| { |
| "epoch": 116.58927311220889, |
| "grad_norm": 1.3504067659378052, |
| "learning_rate": 8.662207357859532e-06, |
| "loss": 4.1221, |
| "step": 8200 |
| }, |
| { |
| "epoch": 116.58927311220889, |
| "eval_loss": 3.946784257888794, |
| "eval_runtime": 75.2097, |
| "eval_samples_per_second": 132.962, |
| "eval_steps_per_second": 8.31, |
| "step": 8200 |
| }, |
| { |
| "epoch": 118.00070571630205, |
| "grad_norm": 1.3503401279449463, |
| "learning_rate": 8.645484949832776e-06, |
| "loss": 4.1107, |
| "step": 8300 |
| }, |
| { |
| "epoch": 118.00070571630205, |
| "eval_loss": 3.936836004257202, |
| "eval_runtime": 75.1834, |
| "eval_samples_per_second": 133.008, |
| "eval_steps_per_second": 8.313, |
| "step": 8300 |
| }, |
| { |
| "epoch": 119.4121383203952, |
| "grad_norm": 1.4280999898910522, |
| "learning_rate": 8.628762541806021e-06, |
| "loss": 4.1001, |
| "step": 8400 |
| }, |
| { |
| "epoch": 119.4121383203952, |
| "eval_loss": 3.926053047180176, |
| "eval_runtime": 75.2443, |
| "eval_samples_per_second": 132.9, |
| "eval_steps_per_second": 8.306, |
| "step": 8400 |
| }, |
| { |
| "epoch": 120.82357092448835, |
| "grad_norm": 1.8372641801834106, |
| "learning_rate": 8.612040133779265e-06, |
| "loss": 4.0876, |
| "step": 8500 |
| }, |
| { |
| "epoch": 120.82357092448835, |
| "eval_loss": 3.9173219203948975, |
| "eval_runtime": 75.0392, |
| "eval_samples_per_second": 133.264, |
| "eval_steps_per_second": 8.329, |
| "step": 8500 |
| }, |
| { |
| "epoch": 122.23500352858152, |
| "grad_norm": 1.6978626251220703, |
| "learning_rate": 8.595317725752509e-06, |
| "loss": 4.079, |
| "step": 8600 |
| }, |
| { |
| "epoch": 122.23500352858152, |
| "eval_loss": 3.9100394248962402, |
| "eval_runtime": 75.1695, |
| "eval_samples_per_second": 133.033, |
| "eval_steps_per_second": 8.315, |
| "step": 8600 |
| }, |
| { |
| "epoch": 123.64643613267467, |
| "grad_norm": 1.5298271179199219, |
| "learning_rate": 8.578595317725752e-06, |
| "loss": 4.069, |
| "step": 8700 |
| }, |
| { |
| "epoch": 123.64643613267467, |
| "eval_loss": 3.9006407260894775, |
| "eval_runtime": 75.13, |
| "eval_samples_per_second": 133.103, |
| "eval_steps_per_second": 8.319, |
| "step": 8700 |
| }, |
| { |
| "epoch": 125.05786873676782, |
| "grad_norm": 1.4782963991165161, |
| "learning_rate": 8.561872909698998e-06, |
| "loss": 4.06, |
| "step": 8800 |
| }, |
| { |
| "epoch": 125.05786873676782, |
| "eval_loss": 3.8956563472747803, |
| "eval_runtime": 75.325, |
| "eval_samples_per_second": 132.758, |
| "eval_steps_per_second": 8.297, |
| "step": 8800 |
| }, |
| { |
| "epoch": 126.46930134086098, |
| "grad_norm": 1.5350950956344604, |
| "learning_rate": 8.545150501672242e-06, |
| "loss": 4.0513, |
| "step": 8900 |
| }, |
| { |
| "epoch": 126.46930134086098, |
| "eval_loss": 3.887305498123169, |
| "eval_runtime": 75.2815, |
| "eval_samples_per_second": 132.835, |
| "eval_steps_per_second": 8.302, |
| "step": 8900 |
| }, |
| { |
| "epoch": 127.88073394495413, |
| "grad_norm": 1.1390595436096191, |
| "learning_rate": 8.528428093645485e-06, |
| "loss": 4.0414, |
| "step": 9000 |
| }, |
| { |
| "epoch": 127.88073394495413, |
| "eval_loss": 3.882507801055908, |
| "eval_runtime": 75.0211, |
| "eval_samples_per_second": 133.296, |
| "eval_steps_per_second": 8.331, |
| "step": 9000 |
| }, |
| { |
| "epoch": 129.2921665490473, |
| "grad_norm": 1.2423325777053833, |
| "learning_rate": 8.511705685618729e-06, |
| "loss": 4.0329, |
| "step": 9100 |
| }, |
| { |
| "epoch": 129.2921665490473, |
| "eval_loss": 3.8772571086883545, |
| "eval_runtime": 75.2328, |
| "eval_samples_per_second": 132.921, |
| "eval_steps_per_second": 8.308, |
| "step": 9100 |
| }, |
| { |
| "epoch": 130.70359915314043, |
| "grad_norm": 1.1587265729904175, |
| "learning_rate": 8.494983277591975e-06, |
| "loss": 4.0247, |
| "step": 9200 |
| }, |
| { |
| "epoch": 130.70359915314043, |
| "eval_loss": 3.8697094917297363, |
| "eval_runtime": 75.2987, |
| "eval_samples_per_second": 132.804, |
| "eval_steps_per_second": 8.3, |
| "step": 9200 |
| }, |
| { |
| "epoch": 132.1150317572336, |
| "grad_norm": 1.3564627170562744, |
| "learning_rate": 8.478260869565218e-06, |
| "loss": 4.0162, |
| "step": 9300 |
| }, |
| { |
| "epoch": 132.1150317572336, |
| "eval_loss": 3.8612961769104004, |
| "eval_runtime": 75.0366, |
| "eval_samples_per_second": 133.268, |
| "eval_steps_per_second": 8.329, |
| "step": 9300 |
| }, |
| { |
| "epoch": 133.52646436132676, |
| "grad_norm": 1.1728644371032715, |
| "learning_rate": 8.461538461538462e-06, |
| "loss": 4.0088, |
| "step": 9400 |
| }, |
| { |
| "epoch": 133.52646436132676, |
| "eval_loss": 3.853496789932251, |
| "eval_runtime": 75.1117, |
| "eval_samples_per_second": 133.135, |
| "eval_steps_per_second": 8.321, |
| "step": 9400 |
| }, |
| { |
| "epoch": 134.9378969654199, |
| "grad_norm": 1.221337080001831, |
| "learning_rate": 8.444816053511706e-06, |
| "loss": 3.9999, |
| "step": 9500 |
| }, |
| { |
| "epoch": 134.9378969654199, |
| "eval_loss": 3.848421812057495, |
| "eval_runtime": 75.1472, |
| "eval_samples_per_second": 133.072, |
| "eval_steps_per_second": 8.317, |
| "step": 9500 |
| }, |
| { |
| "epoch": 136.34932956951306, |
| "grad_norm": 1.2611275911331177, |
| "learning_rate": 8.42809364548495e-06, |
| "loss": 3.9922, |
| "step": 9600 |
| }, |
| { |
| "epoch": 136.34932956951306, |
| "eval_loss": 3.8415913581848145, |
| "eval_runtime": 75.2162, |
| "eval_samples_per_second": 132.95, |
| "eval_steps_per_second": 8.309, |
| "step": 9600 |
| }, |
| { |
| "epoch": 137.76076217360622, |
| "grad_norm": 1.0138766765594482, |
| "learning_rate": 8.411371237458195e-06, |
| "loss": 3.9831, |
| "step": 9700 |
| }, |
| { |
| "epoch": 137.76076217360622, |
| "eval_loss": 3.8360984325408936, |
| "eval_runtime": 75.2818, |
| "eval_samples_per_second": 132.834, |
| "eval_steps_per_second": 8.302, |
| "step": 9700 |
| }, |
| { |
| "epoch": 139.17219477769936, |
| "grad_norm": 1.4916014671325684, |
| "learning_rate": 8.394648829431439e-06, |
| "loss": 3.9754, |
| "step": 9800 |
| }, |
| { |
| "epoch": 139.17219477769936, |
| "eval_loss": 3.8284976482391357, |
| "eval_runtime": 75.0346, |
| "eval_samples_per_second": 133.272, |
| "eval_steps_per_second": 8.329, |
| "step": 9800 |
| }, |
| { |
| "epoch": 140.58362738179252, |
| "grad_norm": 1.2558103799819946, |
| "learning_rate": 8.377926421404683e-06, |
| "loss": 3.9674, |
| "step": 9900 |
| }, |
| { |
| "epoch": 140.58362738179252, |
| "eval_loss": 3.8220624923706055, |
| "eval_runtime": 75.3076, |
| "eval_samples_per_second": 132.789, |
| "eval_steps_per_second": 8.299, |
| "step": 9900 |
| }, |
| { |
| "epoch": 141.99505998588566, |
| "grad_norm": 1.076314926147461, |
| "learning_rate": 8.361204013377926e-06, |
| "loss": 3.9601, |
| "step": 10000 |
| }, |
| { |
| "epoch": 141.99505998588566, |
| "eval_loss": 3.814333915710449, |
| "eval_runtime": 75.2689, |
| "eval_samples_per_second": 132.857, |
| "eval_steps_per_second": 8.304, |
| "step": 10000 |
| }, |
| { |
| "epoch": 143.40649258997882, |
| "grad_norm": 1.3485060930252075, |
| "learning_rate": 8.344481605351172e-06, |
| "loss": 3.9518, |
| "step": 10100 |
| }, |
| { |
| "epoch": 143.40649258997882, |
| "eval_loss": 3.8039023876190186, |
| "eval_runtime": 75.2506, |
| "eval_samples_per_second": 132.889, |
| "eval_steps_per_second": 8.306, |
| "step": 10100 |
| }, |
| { |
| "epoch": 144.817925194072, |
| "grad_norm": 1.477800726890564, |
| "learning_rate": 8.327759197324416e-06, |
| "loss": 3.9453, |
| "step": 10200 |
| }, |
| { |
| "epoch": 144.817925194072, |
| "eval_loss": 3.8053476810455322, |
| "eval_runtime": 75.7225, |
| "eval_samples_per_second": 132.061, |
| "eval_steps_per_second": 8.254, |
| "step": 10200 |
| }, |
| { |
| "epoch": 146.22935779816513, |
| "grad_norm": 0.9907758235931396, |
| "learning_rate": 8.31103678929766e-06, |
| "loss": 3.9382, |
| "step": 10300 |
| }, |
| { |
| "epoch": 146.22935779816513, |
| "eval_loss": 3.7936344146728516, |
| "eval_runtime": 75.3213, |
| "eval_samples_per_second": 132.765, |
| "eval_steps_per_second": 8.298, |
| "step": 10300 |
| }, |
| { |
| "epoch": 147.6407904022583, |
| "grad_norm": 0.9641264081001282, |
| "learning_rate": 8.294314381270903e-06, |
| "loss": 3.93, |
| "step": 10400 |
| }, |
| { |
| "epoch": 147.6407904022583, |
| "eval_loss": 3.787029266357422, |
| "eval_runtime": 75.7774, |
| "eval_samples_per_second": 131.965, |
| "eval_steps_per_second": 8.248, |
| "step": 10400 |
| }, |
| { |
| "epoch": 149.05222300635145, |
| "grad_norm": 0.9813300371170044, |
| "learning_rate": 8.277591973244149e-06, |
| "loss": 3.9242, |
| "step": 10500 |
| }, |
| { |
| "epoch": 149.05222300635145, |
| "eval_loss": 3.785655975341797, |
| "eval_runtime": 75.7954, |
| "eval_samples_per_second": 131.934, |
| "eval_steps_per_second": 8.246, |
| "step": 10500 |
| }, |
| { |
| "epoch": 150.4636556104446, |
| "grad_norm": 1.3373197317123413, |
| "learning_rate": 8.260869565217392e-06, |
| "loss": 3.9171, |
| "step": 10600 |
| }, |
| { |
| "epoch": 150.4636556104446, |
| "eval_loss": 3.7779102325439453, |
| "eval_runtime": 75.7673, |
| "eval_samples_per_second": 131.983, |
| "eval_steps_per_second": 8.249, |
| "step": 10600 |
| }, |
| { |
| "epoch": 151.87508821453775, |
| "grad_norm": 1.0385922193527222, |
| "learning_rate": 8.244147157190636e-06, |
| "loss": 3.91, |
| "step": 10700 |
| }, |
| { |
| "epoch": 151.87508821453775, |
| "eval_loss": 3.7687366008758545, |
| "eval_runtime": 75.7457, |
| "eval_samples_per_second": 132.021, |
| "eval_steps_per_second": 8.251, |
| "step": 10700 |
| }, |
| { |
| "epoch": 153.28652081863092, |
| "grad_norm": 1.0471336841583252, |
| "learning_rate": 8.22742474916388e-06, |
| "loss": 3.9039, |
| "step": 10800 |
| }, |
| { |
| "epoch": 153.28652081863092, |
| "eval_loss": 3.7694108486175537, |
| "eval_runtime": 75.7692, |
| "eval_samples_per_second": 131.98, |
| "eval_steps_per_second": 8.249, |
| "step": 10800 |
| }, |
| { |
| "epoch": 154.69795342272405, |
| "grad_norm": 1.0338389873504639, |
| "learning_rate": 8.210702341137125e-06, |
| "loss": 3.8976, |
| "step": 10900 |
| }, |
| { |
| "epoch": 154.69795342272405, |
| "eval_loss": 3.762704610824585, |
| "eval_runtime": 75.3453, |
| "eval_samples_per_second": 132.722, |
| "eval_steps_per_second": 8.295, |
| "step": 10900 |
| }, |
| { |
| "epoch": 156.10938602681722, |
| "grad_norm": 0.9690730571746826, |
| "learning_rate": 8.193979933110369e-06, |
| "loss": 3.8914, |
| "step": 11000 |
| }, |
| { |
| "epoch": 156.10938602681722, |
| "eval_loss": 3.7522528171539307, |
| "eval_runtime": 75.7714, |
| "eval_samples_per_second": 131.976, |
| "eval_steps_per_second": 8.248, |
| "step": 11000 |
| }, |
| { |
| "epoch": 157.52081863091038, |
| "grad_norm": 1.2407863140106201, |
| "learning_rate": 8.177257525083613e-06, |
| "loss": 3.8845, |
| "step": 11100 |
| }, |
| { |
| "epoch": 157.52081863091038, |
| "eval_loss": 3.752902030944824, |
| "eval_runtime": 75.7459, |
| "eval_samples_per_second": 132.02, |
| "eval_steps_per_second": 8.251, |
| "step": 11100 |
| }, |
| { |
| "epoch": 158.93225123500352, |
| "grad_norm": 1.0596587657928467, |
| "learning_rate": 8.160535117056857e-06, |
| "loss": 3.8795, |
| "step": 11200 |
| }, |
| { |
| "epoch": 158.93225123500352, |
| "eval_loss": 3.7448883056640625, |
| "eval_runtime": 75.3391, |
| "eval_samples_per_second": 132.733, |
| "eval_steps_per_second": 8.296, |
| "step": 11200 |
| }, |
| { |
| "epoch": 160.34368383909668, |
| "grad_norm": 1.1886705160140991, |
| "learning_rate": 8.143812709030102e-06, |
| "loss": 3.872, |
| "step": 11300 |
| }, |
| { |
| "epoch": 160.34368383909668, |
| "eval_loss": 3.740713357925415, |
| "eval_runtime": 75.3057, |
| "eval_samples_per_second": 132.792, |
| "eval_steps_per_second": 8.3, |
| "step": 11300 |
| }, |
| { |
| "epoch": 161.75511644318985, |
| "grad_norm": 1.0268244743347168, |
| "learning_rate": 8.127090301003346e-06, |
| "loss": 3.8648, |
| "step": 11400 |
| }, |
| { |
| "epoch": 161.75511644318985, |
| "eval_loss": 3.732024669647217, |
| "eval_runtime": 75.3507, |
| "eval_samples_per_second": 132.713, |
| "eval_steps_per_second": 8.295, |
| "step": 11400 |
| }, |
| { |
| "epoch": 163.16654904728298, |
| "grad_norm": 1.1183993816375732, |
| "learning_rate": 8.11036789297659e-06, |
| "loss": 3.8592, |
| "step": 11500 |
| }, |
| { |
| "epoch": 163.16654904728298, |
| "eval_loss": 3.727663993835449, |
| "eval_runtime": 75.0877, |
| "eval_samples_per_second": 133.178, |
| "eval_steps_per_second": 8.324, |
| "step": 11500 |
| }, |
| { |
| "epoch": 164.57798165137615, |
| "grad_norm": 0.9553079605102539, |
| "learning_rate": 8.093645484949833e-06, |
| "loss": 3.8529, |
| "step": 11600 |
| }, |
| { |
| "epoch": 164.57798165137615, |
| "eval_loss": 3.7202141284942627, |
| "eval_runtime": 75.341, |
| "eval_samples_per_second": 132.73, |
| "eval_steps_per_second": 8.296, |
| "step": 11600 |
| }, |
| { |
| "epoch": 165.9894142554693, |
| "grad_norm": 0.9820226430892944, |
| "learning_rate": 8.076923076923077e-06, |
| "loss": 3.8473, |
| "step": 11700 |
| }, |
| { |
| "epoch": 165.9894142554693, |
| "eval_loss": 3.7166054248809814, |
| "eval_runtime": 75.2501, |
| "eval_samples_per_second": 132.89, |
| "eval_steps_per_second": 8.306, |
| "step": 11700 |
| }, |
| { |
| "epoch": 167.40084685956245, |
| "grad_norm": 0.9525455236434937, |
| "learning_rate": 8.060200668896322e-06, |
| "loss": 3.8402, |
| "step": 11800 |
| }, |
| { |
| "epoch": 167.40084685956245, |
| "eval_loss": 3.7105445861816406, |
| "eval_runtime": 75.2571, |
| "eval_samples_per_second": 132.878, |
| "eval_steps_per_second": 8.305, |
| "step": 11800 |
| }, |
| { |
| "epoch": 168.8122794636556, |
| "grad_norm": 1.0878891944885254, |
| "learning_rate": 8.043478260869566e-06, |
| "loss": 3.8338, |
| "step": 11900 |
| }, |
| { |
| "epoch": 168.8122794636556, |
| "eval_loss": 3.70639967918396, |
| "eval_runtime": 75.2616, |
| "eval_samples_per_second": 132.87, |
| "eval_steps_per_second": 8.304, |
| "step": 11900 |
| }, |
| { |
| "epoch": 170.22371206774878, |
| "grad_norm": 0.8236098289489746, |
| "learning_rate": 8.02675585284281e-06, |
| "loss": 3.8274, |
| "step": 12000 |
| }, |
| { |
| "epoch": 170.22371206774878, |
| "eval_loss": 3.6996874809265137, |
| "eval_runtime": 75.0887, |
| "eval_samples_per_second": 133.176, |
| "eval_steps_per_second": 8.323, |
| "step": 12000 |
| }, |
| { |
| "epoch": 171.6351446718419, |
| "grad_norm": 0.997268557548523, |
| "learning_rate": 8.010033444816054e-06, |
| "loss": 3.8213, |
| "step": 12100 |
| }, |
| { |
| "epoch": 171.6351446718419, |
| "eval_loss": 3.6938281059265137, |
| "eval_runtime": 75.2392, |
| "eval_samples_per_second": 132.909, |
| "eval_steps_per_second": 8.307, |
| "step": 12100 |
| }, |
| { |
| "epoch": 173.04657727593508, |
| "grad_norm": 0.7722117900848389, |
| "learning_rate": 7.9933110367893e-06, |
| "loss": 3.8171, |
| "step": 12200 |
| }, |
| { |
| "epoch": 173.04657727593508, |
| "eval_loss": 3.6874475479125977, |
| "eval_runtime": 75.2829, |
| "eval_samples_per_second": 132.832, |
| "eval_steps_per_second": 8.302, |
| "step": 12200 |
| }, |
| { |
| "epoch": 174.45800988002824, |
| "grad_norm": 0.7989856004714966, |
| "learning_rate": 7.976588628762543e-06, |
| "loss": 3.8094, |
| "step": 12300 |
| }, |
| { |
| "epoch": 174.45800988002824, |
| "eval_loss": 3.681959390640259, |
| "eval_runtime": 75.3827, |
| "eval_samples_per_second": 132.657, |
| "eval_steps_per_second": 8.291, |
| "step": 12300 |
| }, |
| { |
| "epoch": 175.86944248412138, |
| "grad_norm": 0.8630412817001343, |
| "learning_rate": 7.959866220735787e-06, |
| "loss": 3.8039, |
| "step": 12400 |
| }, |
| { |
| "epoch": 175.86944248412138, |
| "eval_loss": 3.6759207248687744, |
| "eval_runtime": 75.0135, |
| "eval_samples_per_second": 133.309, |
| "eval_steps_per_second": 8.332, |
| "step": 12400 |
| }, |
| { |
| "epoch": 177.28087508821454, |
| "grad_norm": 0.7929290533065796, |
| "learning_rate": 7.94314381270903e-06, |
| "loss": 3.7975, |
| "step": 12500 |
| }, |
| { |
| "epoch": 177.28087508821454, |
| "eval_loss": 3.6663968563079834, |
| "eval_runtime": 75.3138, |
| "eval_samples_per_second": 132.778, |
| "eval_steps_per_second": 8.299, |
| "step": 12500 |
| }, |
| { |
| "epoch": 178.69230769230768, |
| "grad_norm": 0.9302893280982971, |
| "learning_rate": 7.926421404682276e-06, |
| "loss": 3.7915, |
| "step": 12600 |
| }, |
| { |
| "epoch": 178.69230769230768, |
| "eval_loss": 3.665828227996826, |
| "eval_runtime": 75.2973, |
| "eval_samples_per_second": 132.807, |
| "eval_steps_per_second": 8.3, |
| "step": 12600 |
| }, |
| { |
| "epoch": 180.10374029640084, |
| "grad_norm": 0.9526273608207703, |
| "learning_rate": 7.90969899665552e-06, |
| "loss": 3.7845, |
| "step": 12700 |
| }, |
| { |
| "epoch": 180.10374029640084, |
| "eval_loss": 3.6617684364318848, |
| "eval_runtime": 75.2728, |
| "eval_samples_per_second": 132.85, |
| "eval_steps_per_second": 8.303, |
| "step": 12700 |
| }, |
| { |
| "epoch": 181.515172900494, |
| "grad_norm": 0.99673992395401, |
| "learning_rate": 7.892976588628763e-06, |
| "loss": 3.7792, |
| "step": 12800 |
| }, |
| { |
| "epoch": 181.515172900494, |
| "eval_loss": 3.6547155380249023, |
| "eval_runtime": 75.0256, |
| "eval_samples_per_second": 133.288, |
| "eval_steps_per_second": 8.33, |
| "step": 12800 |
| }, |
| { |
| "epoch": 182.92660550458714, |
| "grad_norm": 0.9285475015640259, |
| "learning_rate": 7.876254180602007e-06, |
| "loss": 3.7708, |
| "step": 12900 |
| }, |
| { |
| "epoch": 182.92660550458714, |
| "eval_loss": 3.649796485900879, |
| "eval_runtime": 75.3331, |
| "eval_samples_per_second": 132.744, |
| "eval_steps_per_second": 8.296, |
| "step": 12900 |
| }, |
| { |
| "epoch": 184.3380381086803, |
| "grad_norm": 1.0837434530258179, |
| "learning_rate": 7.859531772575253e-06, |
| "loss": 3.7654, |
| "step": 13000 |
| }, |
| { |
| "epoch": 184.3380381086803, |
| "eval_loss": 3.645569086074829, |
| "eval_runtime": 75.2864, |
| "eval_samples_per_second": 132.826, |
| "eval_steps_per_second": 8.302, |
| "step": 13000 |
| }, |
| { |
| "epoch": 185.74947071277347, |
| "grad_norm": 0.8819906711578369, |
| "learning_rate": 7.842809364548496e-06, |
| "loss": 3.7596, |
| "step": 13100 |
| }, |
| { |
| "epoch": 185.74947071277347, |
| "eval_loss": 3.637125015258789, |
| "eval_runtime": 75.2984, |
| "eval_samples_per_second": 132.805, |
| "eval_steps_per_second": 8.3, |
| "step": 13100 |
| }, |
| { |
| "epoch": 187.1609033168666, |
| "grad_norm": 0.8664088249206543, |
| "learning_rate": 7.82608695652174e-06, |
| "loss": 3.7528, |
| "step": 13200 |
| }, |
| { |
| "epoch": 187.1609033168666, |
| "eval_loss": 3.6319968700408936, |
| "eval_runtime": 75.3308, |
| "eval_samples_per_second": 132.748, |
| "eval_steps_per_second": 8.297, |
| "step": 13200 |
| }, |
| { |
| "epoch": 188.57233592095977, |
| "grad_norm": 1.0038634538650513, |
| "learning_rate": 7.809364548494984e-06, |
| "loss": 3.7462, |
| "step": 13300 |
| }, |
| { |
| "epoch": 188.57233592095977, |
| "eval_loss": 3.6271727085113525, |
| "eval_runtime": 75.185, |
| "eval_samples_per_second": 133.005, |
| "eval_steps_per_second": 8.313, |
| "step": 13300 |
| }, |
| { |
| "epoch": 189.98376852505294, |
| "grad_norm": 0.8182855248451233, |
| "learning_rate": 7.792642140468228e-06, |
| "loss": 3.7395, |
| "step": 13400 |
| }, |
| { |
| "epoch": 189.98376852505294, |
| "eval_loss": 3.621074676513672, |
| "eval_runtime": 75.3792, |
| "eval_samples_per_second": 132.663, |
| "eval_steps_per_second": 8.291, |
| "step": 13400 |
| }, |
| { |
| "epoch": 191.39520112914607, |
| "grad_norm": 0.7293921113014221, |
| "learning_rate": 7.775919732441473e-06, |
| "loss": 3.7333, |
| "step": 13500 |
| }, |
| { |
| "epoch": 191.39520112914607, |
| "eval_loss": 3.61885929107666, |
| "eval_runtime": 75.3514, |
| "eval_samples_per_second": 132.712, |
| "eval_steps_per_second": 8.294, |
| "step": 13500 |
| }, |
| { |
| "epoch": 192.80663373323924, |
| "grad_norm": 0.7797924876213074, |
| "learning_rate": 7.759197324414717e-06, |
| "loss": 3.7263, |
| "step": 13600 |
| }, |
| { |
| "epoch": 192.80663373323924, |
| "eval_loss": 3.612473726272583, |
| "eval_runtime": 75.3512, |
| "eval_samples_per_second": 132.712, |
| "eval_steps_per_second": 8.294, |
| "step": 13600 |
| }, |
| { |
| "epoch": 194.2180663373324, |
| "grad_norm": 0.8082018494606018, |
| "learning_rate": 7.74247491638796e-06, |
| "loss": 3.7209, |
| "step": 13700 |
| }, |
| { |
| "epoch": 194.2180663373324, |
| "eval_loss": 3.6049203872680664, |
| "eval_runtime": 75.3686, |
| "eval_samples_per_second": 132.681, |
| "eval_steps_per_second": 8.293, |
| "step": 13700 |
| }, |
| { |
| "epoch": 195.62949894142554, |
| "grad_norm": 0.7965067625045776, |
| "learning_rate": 7.725752508361204e-06, |
| "loss": 3.7147, |
| "step": 13800 |
| }, |
| { |
| "epoch": 195.62949894142554, |
| "eval_loss": 3.599919080734253, |
| "eval_runtime": 75.1572, |
| "eval_samples_per_second": 133.055, |
| "eval_steps_per_second": 8.316, |
| "step": 13800 |
| }, |
| { |
| "epoch": 197.0409315455187, |
| "grad_norm": 0.8104972839355469, |
| "learning_rate": 7.70903010033445e-06, |
| "loss": 3.7082, |
| "step": 13900 |
| }, |
| { |
| "epoch": 197.0409315455187, |
| "eval_loss": 3.5933010578155518, |
| "eval_runtime": 75.3918, |
| "eval_samples_per_second": 132.641, |
| "eval_steps_per_second": 8.29, |
| "step": 13900 |
| }, |
| { |
| "epoch": 198.45236414961187, |
| "grad_norm": 0.9350934624671936, |
| "learning_rate": 7.692307692307694e-06, |
| "loss": 3.7011, |
| "step": 14000 |
| }, |
| { |
| "epoch": 198.45236414961187, |
| "eval_loss": 3.586028814315796, |
| "eval_runtime": 75.495, |
| "eval_samples_per_second": 132.459, |
| "eval_steps_per_second": 8.279, |
| "step": 14000 |
| }, |
| { |
| "epoch": 199.863796753705, |
| "grad_norm": 0.7946003079414368, |
| "learning_rate": 7.675585284280937e-06, |
| "loss": 3.6949, |
| "step": 14100 |
| }, |
| { |
| "epoch": 199.863796753705, |
| "eval_loss": 3.581510543823242, |
| "eval_runtime": 75.3493, |
| "eval_samples_per_second": 132.715, |
| "eval_steps_per_second": 8.295, |
| "step": 14100 |
| }, |
| { |
| "epoch": 201.27522935779817, |
| "grad_norm": 0.776996910572052, |
| "learning_rate": 7.658862876254181e-06, |
| "loss": 3.689, |
| "step": 14200 |
| }, |
| { |
| "epoch": 201.27522935779817, |
| "eval_loss": 3.5773942470550537, |
| "eval_runtime": 75.3953, |
| "eval_samples_per_second": 132.634, |
| "eval_steps_per_second": 8.29, |
| "step": 14200 |
| }, |
| { |
| "epoch": 202.68666196189133, |
| "grad_norm": 0.6654204726219177, |
| "learning_rate": 7.642140468227427e-06, |
| "loss": 3.6824, |
| "step": 14300 |
| }, |
| { |
| "epoch": 202.68666196189133, |
| "eval_loss": 3.574646472930908, |
| "eval_runtime": 75.4068, |
| "eval_samples_per_second": 132.614, |
| "eval_steps_per_second": 8.288, |
| "step": 14300 |
| }, |
| { |
| "epoch": 204.09809456598447, |
| "grad_norm": 0.7471653819084167, |
| "learning_rate": 7.62541806020067e-06, |
| "loss": 3.6764, |
| "step": 14400 |
| }, |
| { |
| "epoch": 204.09809456598447, |
| "eval_loss": 3.5689644813537598, |
| "eval_runtime": 75.2186, |
| "eval_samples_per_second": 132.946, |
| "eval_steps_per_second": 8.309, |
| "step": 14400 |
| }, |
| { |
| "epoch": 205.50952717007763, |
| "grad_norm": 0.6326889991760254, |
| "learning_rate": 7.608695652173914e-06, |
| "loss": 3.6708, |
| "step": 14500 |
| }, |
| { |
| "epoch": 205.50952717007763, |
| "eval_loss": 3.5631089210510254, |
| "eval_runtime": 75.269, |
| "eval_samples_per_second": 132.857, |
| "eval_steps_per_second": 8.304, |
| "step": 14500 |
| }, |
| { |
| "epoch": 206.9209597741708, |
| "grad_norm": 0.6415218710899353, |
| "learning_rate": 7.591973244147159e-06, |
| "loss": 3.666, |
| "step": 14600 |
| }, |
| { |
| "epoch": 206.9209597741708, |
| "eval_loss": 3.559755563735962, |
| "eval_runtime": 75.341, |
| "eval_samples_per_second": 132.73, |
| "eval_steps_per_second": 8.296, |
| "step": 14600 |
| }, |
| { |
| "epoch": 208.33239237826393, |
| "grad_norm": 0.6695938110351562, |
| "learning_rate": 7.5752508361204024e-06, |
| "loss": 3.66, |
| "step": 14700 |
| }, |
| { |
| "epoch": 208.33239237826393, |
| "eval_loss": 3.5543429851531982, |
| "eval_runtime": 75.3608, |
| "eval_samples_per_second": 132.695, |
| "eval_steps_per_second": 8.293, |
| "step": 14700 |
| }, |
| { |
| "epoch": 209.7438249823571, |
| "grad_norm": 0.8358725309371948, |
| "learning_rate": 7.558528428093647e-06, |
| "loss": 3.6542, |
| "step": 14800 |
| }, |
| { |
| "epoch": 209.7438249823571, |
| "eval_loss": 3.5488317012786865, |
| "eval_runtime": 75.3691, |
| "eval_samples_per_second": 132.68, |
| "eval_steps_per_second": 8.293, |
| "step": 14800 |
| }, |
| { |
| "epoch": 211.15525758645023, |
| "grad_norm": 0.87603360414505, |
| "learning_rate": 7.541806020066891e-06, |
| "loss": 3.649, |
| "step": 14900 |
| }, |
| { |
| "epoch": 211.15525758645023, |
| "eval_loss": 3.545093297958374, |
| "eval_runtime": 75.0836, |
| "eval_samples_per_second": 133.185, |
| "eval_steps_per_second": 8.324, |
| "step": 14900 |
| }, |
| { |
| "epoch": 212.5666901905434, |
| "grad_norm": 0.7373155951499939, |
| "learning_rate": 7.5250836120401346e-06, |
| "loss": 3.6429, |
| "step": 15000 |
| }, |
| { |
| "epoch": 212.5666901905434, |
| "eval_loss": 3.5412116050720215, |
| "eval_runtime": 75.3512, |
| "eval_samples_per_second": 132.712, |
| "eval_steps_per_second": 8.294, |
| "step": 15000 |
| }, |
| { |
| "epoch": 213.97812279463656, |
| "grad_norm": 0.6794809699058533, |
| "learning_rate": 7.508361204013379e-06, |
| "loss": 3.6389, |
| "step": 15100 |
| }, |
| { |
| "epoch": 213.97812279463656, |
| "eval_loss": 3.53684139251709, |
| "eval_runtime": 75.1526, |
| "eval_samples_per_second": 133.063, |
| "eval_steps_per_second": 8.316, |
| "step": 15100 |
| }, |
| { |
| "epoch": 215.3895553987297, |
| "grad_norm": 0.7504755854606628, |
| "learning_rate": 7.491638795986622e-06, |
| "loss": 3.6338, |
| "step": 15200 |
| }, |
| { |
| "epoch": 215.3895553987297, |
| "eval_loss": 3.530754804611206, |
| "eval_runtime": 75.3259, |
| "eval_samples_per_second": 132.757, |
| "eval_steps_per_second": 8.297, |
| "step": 15200 |
| }, |
| { |
| "epoch": 216.80098800282286, |
| "grad_norm": 0.6434842348098755, |
| "learning_rate": 7.474916387959867e-06, |
| "loss": 3.6291, |
| "step": 15300 |
| }, |
| { |
| "epoch": 216.80098800282286, |
| "eval_loss": 3.5268187522888184, |
| "eval_runtime": 75.4572, |
| "eval_samples_per_second": 132.525, |
| "eval_steps_per_second": 8.283, |
| "step": 15300 |
| }, |
| { |
| "epoch": 218.21242060691603, |
| "grad_norm": 0.7057756781578064, |
| "learning_rate": 7.4581939799331104e-06, |
| "loss": 3.6233, |
| "step": 15400 |
| }, |
| { |
| "epoch": 218.21242060691603, |
| "eval_loss": 3.5211195945739746, |
| "eval_runtime": 75.4018, |
| "eval_samples_per_second": 132.623, |
| "eval_steps_per_second": 8.289, |
| "step": 15400 |
| }, |
| { |
| "epoch": 219.62385321100916, |
| "grad_norm": 0.6788062453269958, |
| "learning_rate": 7.441471571906354e-06, |
| "loss": 3.6187, |
| "step": 15500 |
| }, |
| { |
| "epoch": 219.62385321100916, |
| "eval_loss": 3.5227015018463135, |
| "eval_runtime": 75.1928, |
| "eval_samples_per_second": 132.992, |
| "eval_steps_per_second": 8.312, |
| "step": 15500 |
| }, |
| { |
| "epoch": 221.03528581510233, |
| "grad_norm": 0.5607024431228638, |
| "learning_rate": 7.424749163879599e-06, |
| "loss": 3.6145, |
| "step": 15600 |
| }, |
| { |
| "epoch": 221.03528581510233, |
| "eval_loss": 3.514875888824463, |
| "eval_runtime": 75.3921, |
| "eval_samples_per_second": 132.64, |
| "eval_steps_per_second": 8.29, |
| "step": 15600 |
| }, |
| { |
| "epoch": 222.4467184191955, |
| "grad_norm": 1.0785976648330688, |
| "learning_rate": 7.4080267558528426e-06, |
| "loss": 3.6091, |
| "step": 15700 |
| }, |
| { |
| "epoch": 222.4467184191955, |
| "eval_loss": 3.5113186836242676, |
| "eval_runtime": 75.365, |
| "eval_samples_per_second": 132.688, |
| "eval_steps_per_second": 8.293, |
| "step": 15700 |
| }, |
| { |
| "epoch": 223.85815102328863, |
| "grad_norm": 0.8430230617523193, |
| "learning_rate": 7.391304347826087e-06, |
| "loss": 3.6059, |
| "step": 15800 |
| }, |
| { |
| "epoch": 223.85815102328863, |
| "eval_loss": 3.5113461017608643, |
| "eval_runtime": 75.3749, |
| "eval_samples_per_second": 132.67, |
| "eval_steps_per_second": 8.292, |
| "step": 15800 |
| }, |
| { |
| "epoch": 225.2695836273818, |
| "grad_norm": 0.5923272967338562, |
| "learning_rate": 7.374581939799331e-06, |
| "loss": 3.6016, |
| "step": 15900 |
| }, |
| { |
| "epoch": 225.2695836273818, |
| "eval_loss": 3.5035176277160645, |
| "eval_runtime": 75.2962, |
| "eval_samples_per_second": 132.809, |
| "eval_steps_per_second": 8.301, |
| "step": 15900 |
| }, |
| { |
| "epoch": 226.68101623147496, |
| "grad_norm": 0.7486432194709778, |
| "learning_rate": 7.3578595317725755e-06, |
| "loss": 3.598, |
| "step": 16000 |
| }, |
| { |
| "epoch": 226.68101623147496, |
| "eval_loss": 3.4993245601654053, |
| "eval_runtime": 75.1842, |
| "eval_samples_per_second": 133.007, |
| "eval_steps_per_second": 8.313, |
| "step": 16000 |
| }, |
| { |
| "epoch": 228.0924488355681, |
| "grad_norm": 0.8732613325119019, |
| "learning_rate": 7.341137123745819e-06, |
| "loss": 3.5925, |
| "step": 16100 |
| }, |
| { |
| "epoch": 228.0924488355681, |
| "eval_loss": 3.495875835418701, |
| "eval_runtime": 75.4084, |
| "eval_samples_per_second": 132.611, |
| "eval_steps_per_second": 8.288, |
| "step": 16100 |
| }, |
| { |
| "epoch": 229.50388143966126, |
| "grad_norm": 0.6386623978614807, |
| "learning_rate": 7.324414715719064e-06, |
| "loss": 3.5889, |
| "step": 16200 |
| }, |
| { |
| "epoch": 229.50388143966126, |
| "eval_loss": 3.492525815963745, |
| "eval_runtime": 75.3941, |
| "eval_samples_per_second": 132.636, |
| "eval_steps_per_second": 8.29, |
| "step": 16200 |
| }, |
| { |
| "epoch": 230.91531404375442, |
| "grad_norm": 0.7436226606369019, |
| "learning_rate": 7.307692307692308e-06, |
| "loss": 3.5848, |
| "step": 16300 |
| }, |
| { |
| "epoch": 230.91531404375442, |
| "eval_loss": 3.4906632900238037, |
| "eval_runtime": 75.3655, |
| "eval_samples_per_second": 132.687, |
| "eval_steps_per_second": 8.293, |
| "step": 16300 |
| }, |
| { |
| "epoch": 232.32674664784756, |
| "grad_norm": 0.8418622016906738, |
| "learning_rate": 7.290969899665552e-06, |
| "loss": 3.5803, |
| "step": 16400 |
| }, |
| { |
| "epoch": 232.32674664784756, |
| "eval_loss": 3.4869649410247803, |
| "eval_runtime": 75.3226, |
| "eval_samples_per_second": 132.762, |
| "eval_steps_per_second": 8.298, |
| "step": 16400 |
| }, |
| { |
| "epoch": 233.73817925194072, |
| "grad_norm": 0.7934303879737854, |
| "learning_rate": 7.274247491638796e-06, |
| "loss": 3.5778, |
| "step": 16500 |
| }, |
| { |
| "epoch": 233.73817925194072, |
| "eval_loss": 3.478532314300537, |
| "eval_runtime": 75.0535, |
| "eval_samples_per_second": 133.238, |
| "eval_steps_per_second": 8.327, |
| "step": 16500 |
| }, |
| { |
| "epoch": 235.14961185603389, |
| "grad_norm": 0.6809811592102051, |
| "learning_rate": 7.257525083612041e-06, |
| "loss": 3.5736, |
| "step": 16600 |
| }, |
| { |
| "epoch": 235.14961185603389, |
| "eval_loss": 3.4779250621795654, |
| "eval_runtime": 75.3809, |
| "eval_samples_per_second": 132.66, |
| "eval_steps_per_second": 8.291, |
| "step": 16600 |
| }, |
| { |
| "epoch": 236.56104446012702, |
| "grad_norm": 0.8483596444129944, |
| "learning_rate": 7.240802675585284e-06, |
| "loss": 3.5682, |
| "step": 16700 |
| }, |
| { |
| "epoch": 236.56104446012702, |
| "eval_loss": 3.473116874694824, |
| "eval_runtime": 75.3936, |
| "eval_samples_per_second": 132.637, |
| "eval_steps_per_second": 8.29, |
| "step": 16700 |
| }, |
| { |
| "epoch": 237.9724770642202, |
| "grad_norm": 0.6600437164306641, |
| "learning_rate": 7.224080267558529e-06, |
| "loss": 3.5656, |
| "step": 16800 |
| }, |
| { |
| "epoch": 237.9724770642202, |
| "eval_loss": 3.47174072265625, |
| "eval_runtime": 75.3681, |
| "eval_samples_per_second": 132.682, |
| "eval_steps_per_second": 8.293, |
| "step": 16800 |
| }, |
| { |
| "epoch": 239.38390966831335, |
| "grad_norm": 0.607857882976532, |
| "learning_rate": 7.207357859531773e-06, |
| "loss": 3.5616, |
| "step": 16900 |
| }, |
| { |
| "epoch": 239.38390966831335, |
| "eval_loss": 3.467322587966919, |
| "eval_runtime": 75.3846, |
| "eval_samples_per_second": 132.653, |
| "eval_steps_per_second": 8.291, |
| "step": 16900 |
| }, |
| { |
| "epoch": 240.7953422724065, |
| "grad_norm": 0.6345656514167786, |
| "learning_rate": 7.190635451505017e-06, |
| "loss": 3.5582, |
| "step": 17000 |
| }, |
| { |
| "epoch": 240.7953422724065, |
| "eval_loss": 3.4662554264068604, |
| "eval_runtime": 75.3233, |
| "eval_samples_per_second": 132.761, |
| "eval_steps_per_second": 8.298, |
| "step": 17000 |
| }, |
| { |
| "epoch": 242.20677487649965, |
| "grad_norm": 0.6691942811012268, |
| "learning_rate": 7.173913043478261e-06, |
| "loss": 3.5552, |
| "step": 17100 |
| }, |
| { |
| "epoch": 242.20677487649965, |
| "eval_loss": 3.4594500064849854, |
| "eval_runtime": 75.1343, |
| "eval_samples_per_second": 133.095, |
| "eval_steps_per_second": 8.318, |
| "step": 17100 |
| }, |
| { |
| "epoch": 243.61820748059282, |
| "grad_norm": 0.6638765931129456, |
| "learning_rate": 7.157190635451506e-06, |
| "loss": 3.551, |
| "step": 17200 |
| }, |
| { |
| "epoch": 243.61820748059282, |
| "eval_loss": 3.459233283996582, |
| "eval_runtime": 75.3664, |
| "eval_samples_per_second": 132.685, |
| "eval_steps_per_second": 8.293, |
| "step": 17200 |
| }, |
| { |
| "epoch": 245.02964008468595, |
| "grad_norm": 0.7361800670623779, |
| "learning_rate": 7.1404682274247495e-06, |
| "loss": 3.5476, |
| "step": 17300 |
| }, |
| { |
| "epoch": 245.02964008468595, |
| "eval_loss": 3.456892251968384, |
| "eval_runtime": 75.3908, |
| "eval_samples_per_second": 132.642, |
| "eval_steps_per_second": 8.29, |
| "step": 17300 |
| }, |
| { |
| "epoch": 246.44107268877912, |
| "grad_norm": 0.7599518299102783, |
| "learning_rate": 7.123745819397993e-06, |
| "loss": 3.544, |
| "step": 17400 |
| }, |
| { |
| "epoch": 246.44107268877912, |
| "eval_loss": 3.453518867492676, |
| "eval_runtime": 75.3827, |
| "eval_samples_per_second": 132.656, |
| "eval_steps_per_second": 8.291, |
| "step": 17400 |
| }, |
| { |
| "epoch": 247.85250529287225, |
| "grad_norm": 0.6123988032341003, |
| "learning_rate": 7.107023411371238e-06, |
| "loss": 3.5408, |
| "step": 17500 |
| }, |
| { |
| "epoch": 247.85250529287225, |
| "eval_loss": 3.4502522945404053, |
| "eval_runtime": 75.3683, |
| "eval_samples_per_second": 132.682, |
| "eval_steps_per_second": 8.293, |
| "step": 17500 |
| }, |
| { |
| "epoch": 249.26393789696542, |
| "grad_norm": 0.6615849137306213, |
| "learning_rate": 7.0903010033444816e-06, |
| "loss": 3.5376, |
| "step": 17600 |
| }, |
| { |
| "epoch": 249.26393789696542, |
| "eval_loss": 3.446369171142578, |
| "eval_runtime": 75.2088, |
| "eval_samples_per_second": 132.963, |
| "eval_steps_per_second": 8.31, |
| "step": 17600 |
| }, |
| { |
| "epoch": 250.67537050105858, |
| "grad_norm": 0.6400436162948608, |
| "learning_rate": 7.073578595317726e-06, |
| "loss": 3.5338, |
| "step": 17700 |
| }, |
| { |
| "epoch": 250.67537050105858, |
| "eval_loss": 3.444706678390503, |
| "eval_runtime": 75.2466, |
| "eval_samples_per_second": 132.896, |
| "eval_steps_per_second": 8.306, |
| "step": 17700 |
| }, |
| { |
| "epoch": 252.08680310515172, |
| "grad_norm": 0.6528608798980713, |
| "learning_rate": 7.05685618729097e-06, |
| "loss": 3.5307, |
| "step": 17800 |
| }, |
| { |
| "epoch": 252.08680310515172, |
| "eval_loss": 3.4401872158050537, |
| "eval_runtime": 75.4051, |
| "eval_samples_per_second": 132.617, |
| "eval_steps_per_second": 8.289, |
| "step": 17800 |
| }, |
| { |
| "epoch": 253.49823570924488, |
| "grad_norm": 0.7888281345367432, |
| "learning_rate": 7.0401337792642145e-06, |
| "loss": 3.5276, |
| "step": 17900 |
| }, |
| { |
| "epoch": 253.49823570924488, |
| "eval_loss": 3.438217878341675, |
| "eval_runtime": 76.8212, |
| "eval_samples_per_second": 130.172, |
| "eval_steps_per_second": 8.136, |
| "step": 17900 |
| }, |
| { |
| "epoch": 254.90966831333805, |
| "grad_norm": 0.6938881874084473, |
| "learning_rate": 7.023411371237458e-06, |
| "loss": 3.5237, |
| "step": 18000 |
| }, |
| { |
| "epoch": 254.90966831333805, |
| "eval_loss": 3.434736728668213, |
| "eval_runtime": 76.8401, |
| "eval_samples_per_second": 130.14, |
| "eval_steps_per_second": 8.134, |
| "step": 18000 |
| }, |
| { |
| "epoch": 256.3211009174312, |
| "grad_norm": 0.6238895654678345, |
| "learning_rate": 7.006688963210703e-06, |
| "loss": 3.5208, |
| "step": 18100 |
| }, |
| { |
| "epoch": 256.3211009174312, |
| "eval_loss": 3.4336953163146973, |
| "eval_runtime": 76.7196, |
| "eval_samples_per_second": 130.345, |
| "eval_steps_per_second": 8.147, |
| "step": 18100 |
| }, |
| { |
| "epoch": 257.7325335215244, |
| "grad_norm": 0.5130559206008911, |
| "learning_rate": 6.989966555183947e-06, |
| "loss": 3.5184, |
| "step": 18200 |
| }, |
| { |
| "epoch": 257.7325335215244, |
| "eval_loss": 3.4297399520874023, |
| "eval_runtime": 76.8567, |
| "eval_samples_per_second": 130.112, |
| "eval_steps_per_second": 8.132, |
| "step": 18200 |
| }, |
| { |
| "epoch": 259.1439661256175, |
| "grad_norm": 0.480792373418808, |
| "learning_rate": 6.973244147157191e-06, |
| "loss": 3.5156, |
| "step": 18300 |
| }, |
| { |
| "epoch": 259.1439661256175, |
| "eval_loss": 3.4281508922576904, |
| "eval_runtime": 76.9652, |
| "eval_samples_per_second": 129.929, |
| "eval_steps_per_second": 8.121, |
| "step": 18300 |
| }, |
| { |
| "epoch": 260.55539872971065, |
| "grad_norm": 0.6780312657356262, |
| "learning_rate": 6.956521739130435e-06, |
| "loss": 3.5118, |
| "step": 18400 |
| }, |
| { |
| "epoch": 260.55539872971065, |
| "eval_loss": 3.4258944988250732, |
| "eval_runtime": 76.8943, |
| "eval_samples_per_second": 130.049, |
| "eval_steps_per_second": 8.128, |
| "step": 18400 |
| }, |
| { |
| "epoch": 261.96683133380384, |
| "grad_norm": 0.5994666814804077, |
| "learning_rate": 6.93979933110368e-06, |
| "loss": 3.5086, |
| "step": 18500 |
| }, |
| { |
| "epoch": 261.96683133380384, |
| "eval_loss": 3.422513246536255, |
| "eval_runtime": 76.8723, |
| "eval_samples_per_second": 130.086, |
| "eval_steps_per_second": 8.13, |
| "step": 18500 |
| }, |
| { |
| "epoch": 263.378263937897, |
| "grad_norm": 0.7816299796104431, |
| "learning_rate": 6.923076923076923e-06, |
| "loss": 3.505, |
| "step": 18600 |
| }, |
| { |
| "epoch": 263.378263937897, |
| "eval_loss": 3.4186975955963135, |
| "eval_runtime": 76.7391, |
| "eval_samples_per_second": 130.312, |
| "eval_steps_per_second": 8.144, |
| "step": 18600 |
| }, |
| { |
| "epoch": 264.7896965419901, |
| "grad_norm": 0.6295963525772095, |
| "learning_rate": 6.906354515050168e-06, |
| "loss": 3.5025, |
| "step": 18700 |
| }, |
| { |
| "epoch": 264.7896965419901, |
| "eval_loss": 3.416973352432251, |
| "eval_runtime": 76.7644, |
| "eval_samples_per_second": 130.269, |
| "eval_steps_per_second": 8.142, |
| "step": 18700 |
| }, |
| { |
| "epoch": 266.20112914608325, |
| "grad_norm": 0.5306549072265625, |
| "learning_rate": 6.889632107023412e-06, |
| "loss": 3.5002, |
| "step": 18800 |
| }, |
| { |
| "epoch": 266.20112914608325, |
| "eval_loss": 3.4140942096710205, |
| "eval_runtime": 76.9199, |
| "eval_samples_per_second": 130.005, |
| "eval_steps_per_second": 8.125, |
| "step": 18800 |
| }, |
| { |
| "epoch": 267.61256175017644, |
| "grad_norm": 0.6763441562652588, |
| "learning_rate": 6.872909698996656e-06, |
| "loss": 3.4971, |
| "step": 18900 |
| }, |
| { |
| "epoch": 267.61256175017644, |
| "eval_loss": 3.4103851318359375, |
| "eval_runtime": 76.776, |
| "eval_samples_per_second": 130.249, |
| "eval_steps_per_second": 8.141, |
| "step": 18900 |
| }, |
| { |
| "epoch": 269.0239943542696, |
| "grad_norm": 0.557765781879425, |
| "learning_rate": 6.8561872909699e-06, |
| "loss": 3.4944, |
| "step": 19000 |
| }, |
| { |
| "epoch": 269.0239943542696, |
| "eval_loss": 3.4099249839782715, |
| "eval_runtime": 76.8515, |
| "eval_samples_per_second": 130.121, |
| "eval_steps_per_second": 8.133, |
| "step": 19000 |
| }, |
| { |
| "epoch": 270.4354269583627, |
| "grad_norm": 0.6447959542274475, |
| "learning_rate": 6.839464882943144e-06, |
| "loss": 3.492, |
| "step": 19100 |
| }, |
| { |
| "epoch": 270.4354269583627, |
| "eval_loss": 3.408655881881714, |
| "eval_runtime": 76.7546, |
| "eval_samples_per_second": 130.285, |
| "eval_steps_per_second": 8.143, |
| "step": 19100 |
| }, |
| { |
| "epoch": 271.8468595624559, |
| "grad_norm": 0.5875563621520996, |
| "learning_rate": 6.8227424749163885e-06, |
| "loss": 3.4898, |
| "step": 19200 |
| }, |
| { |
| "epoch": 271.8468595624559, |
| "eval_loss": 3.404829263687134, |
| "eval_runtime": 76.8832, |
| "eval_samples_per_second": 130.067, |
| "eval_steps_per_second": 8.129, |
| "step": 19200 |
| }, |
| { |
| "epoch": 273.25829216654904, |
| "grad_norm": 0.6500788927078247, |
| "learning_rate": 6.806020066889632e-06, |
| "loss": 3.4858, |
| "step": 19300 |
| }, |
| { |
| "epoch": 273.25829216654904, |
| "eval_loss": 3.40264892578125, |
| "eval_runtime": 76.8385, |
| "eval_samples_per_second": 130.143, |
| "eval_steps_per_second": 8.134, |
| "step": 19300 |
| }, |
| { |
| "epoch": 274.6697247706422, |
| "grad_norm": 0.7083373069763184, |
| "learning_rate": 6.789297658862877e-06, |
| "loss": 3.4824, |
| "step": 19400 |
| }, |
| { |
| "epoch": 274.6697247706422, |
| "eval_loss": 3.398090124130249, |
| "eval_runtime": 76.8694, |
| "eval_samples_per_second": 130.091, |
| "eval_steps_per_second": 8.131, |
| "step": 19400 |
| }, |
| { |
| "epoch": 276.08115737473537, |
| "grad_norm": 0.7219327092170715, |
| "learning_rate": 6.772575250836121e-06, |
| "loss": 3.4811, |
| "step": 19500 |
| }, |
| { |
| "epoch": 276.08115737473537, |
| "eval_loss": 3.396939516067505, |
| "eval_runtime": 76.9317, |
| "eval_samples_per_second": 129.985, |
| "eval_steps_per_second": 8.124, |
| "step": 19500 |
| }, |
| { |
| "epoch": 277.4925899788285, |
| "grad_norm": 0.49477267265319824, |
| "learning_rate": 6.755852842809365e-06, |
| "loss": 3.4782, |
| "step": 19600 |
| }, |
| { |
| "epoch": 277.4925899788285, |
| "eval_loss": 3.395801544189453, |
| "eval_runtime": 76.7586, |
| "eval_samples_per_second": 130.279, |
| "eval_steps_per_second": 8.142, |
| "step": 19600 |
| }, |
| { |
| "epoch": 278.90402258292164, |
| "grad_norm": 0.7135903835296631, |
| "learning_rate": 6.739130434782609e-06, |
| "loss": 3.4749, |
| "step": 19700 |
| }, |
| { |
| "epoch": 278.90402258292164, |
| "eval_loss": 3.393001079559326, |
| "eval_runtime": 76.9157, |
| "eval_samples_per_second": 130.012, |
| "eval_steps_per_second": 8.126, |
| "step": 19700 |
| }, |
| { |
| "epoch": 280.31545518701483, |
| "grad_norm": 0.7002623677253723, |
| "learning_rate": 6.7224080267558536e-06, |
| "loss": 3.4731, |
| "step": 19800 |
| }, |
| { |
| "epoch": 280.31545518701483, |
| "eval_loss": 3.3903141021728516, |
| "eval_runtime": 75.5607, |
| "eval_samples_per_second": 132.344, |
| "eval_steps_per_second": 8.271, |
| "step": 19800 |
| }, |
| { |
| "epoch": 281.72688779110797, |
| "grad_norm": 0.6401045322418213, |
| "learning_rate": 6.705685618729097e-06, |
| "loss": 3.4704, |
| "step": 19900 |
| }, |
| { |
| "epoch": 281.72688779110797, |
| "eval_loss": 3.3897156715393066, |
| "eval_runtime": 76.9187, |
| "eval_samples_per_second": 130.007, |
| "eval_steps_per_second": 8.125, |
| "step": 19900 |
| }, |
| { |
| "epoch": 283.1383203952011, |
| "grad_norm": 0.6009179353713989, |
| "learning_rate": 6.688963210702342e-06, |
| "loss": 3.4667, |
| "step": 20000 |
| }, |
| { |
| "epoch": 283.1383203952011, |
| "eval_loss": 3.388314962387085, |
| "eval_runtime": 76.761, |
| "eval_samples_per_second": 130.274, |
| "eval_steps_per_second": 8.142, |
| "step": 20000 |
| }, |
| { |
| "epoch": 284.5497529992943, |
| "grad_norm": 0.6317482590675354, |
| "learning_rate": 6.672240802675586e-06, |
| "loss": 3.4653, |
| "step": 20100 |
| }, |
| { |
| "epoch": 284.5497529992943, |
| "eval_loss": 3.382216691970825, |
| "eval_runtime": 76.6247, |
| "eval_samples_per_second": 130.506, |
| "eval_steps_per_second": 8.157, |
| "step": 20100 |
| }, |
| { |
| "epoch": 285.96118560338743, |
| "grad_norm": 0.5381720066070557, |
| "learning_rate": 6.65551839464883e-06, |
| "loss": 3.4639, |
| "step": 20200 |
| }, |
| { |
| "epoch": 285.96118560338743, |
| "eval_loss": 3.379488229751587, |
| "eval_runtime": 76.8042, |
| "eval_samples_per_second": 130.201, |
| "eval_steps_per_second": 8.138, |
| "step": 20200 |
| }, |
| { |
| "epoch": 287.37261820748057, |
| "grad_norm": 0.5575243234634399, |
| "learning_rate": 6.638795986622074e-06, |
| "loss": 3.4614, |
| "step": 20300 |
| }, |
| { |
| "epoch": 287.37261820748057, |
| "eval_loss": 3.37882137298584, |
| "eval_runtime": 76.6731, |
| "eval_samples_per_second": 130.424, |
| "eval_steps_per_second": 8.151, |
| "step": 20300 |
| }, |
| { |
| "epoch": 288.78405081157376, |
| "grad_norm": 0.6351670622825623, |
| "learning_rate": 6.622073578595319e-06, |
| "loss": 3.4581, |
| "step": 20400 |
| }, |
| { |
| "epoch": 288.78405081157376, |
| "eval_loss": 3.376701593399048, |
| "eval_runtime": 76.6829, |
| "eval_samples_per_second": 130.407, |
| "eval_steps_per_second": 8.15, |
| "step": 20400 |
| }, |
| { |
| "epoch": 290.1954834156669, |
| "grad_norm": 0.8166664242744446, |
| "learning_rate": 6.605351170568562e-06, |
| "loss": 3.4559, |
| "step": 20500 |
| }, |
| { |
| "epoch": 290.1954834156669, |
| "eval_loss": 3.37479305267334, |
| "eval_runtime": 76.6609, |
| "eval_samples_per_second": 130.445, |
| "eval_steps_per_second": 8.153, |
| "step": 20500 |
| }, |
| { |
| "epoch": 291.60691601976004, |
| "grad_norm": 0.6857735514640808, |
| "learning_rate": 6.588628762541807e-06, |
| "loss": 3.452, |
| "step": 20600 |
| }, |
| { |
| "epoch": 291.60691601976004, |
| "eval_loss": 3.372075080871582, |
| "eval_runtime": 76.5027, |
| "eval_samples_per_second": 130.714, |
| "eval_steps_per_second": 8.17, |
| "step": 20600 |
| }, |
| { |
| "epoch": 293.0183486238532, |
| "grad_norm": 0.5841640830039978, |
| "learning_rate": 6.571906354515051e-06, |
| "loss": 3.4506, |
| "step": 20700 |
| }, |
| { |
| "epoch": 293.0183486238532, |
| "eval_loss": 3.367901563644409, |
| "eval_runtime": 75.957, |
| "eval_samples_per_second": 131.653, |
| "eval_steps_per_second": 8.228, |
| "step": 20700 |
| }, |
| { |
| "epoch": 294.42978122794636, |
| "grad_norm": 0.6171954870223999, |
| "learning_rate": 6.5551839464882945e-06, |
| "loss": 3.4472, |
| "step": 20800 |
| }, |
| { |
| "epoch": 294.42978122794636, |
| "eval_loss": 3.3702898025512695, |
| "eval_runtime": 75.9329, |
| "eval_samples_per_second": 131.695, |
| "eval_steps_per_second": 8.231, |
| "step": 20800 |
| }, |
| { |
| "epoch": 295.8412138320395, |
| "grad_norm": 0.5663770437240601, |
| "learning_rate": 6.538461538461539e-06, |
| "loss": 3.4471, |
| "step": 20900 |
| }, |
| { |
| "epoch": 295.8412138320395, |
| "eval_loss": 3.363970994949341, |
| "eval_runtime": 75.4098, |
| "eval_samples_per_second": 132.609, |
| "eval_steps_per_second": 8.288, |
| "step": 20900 |
| }, |
| { |
| "epoch": 297.2526464361327, |
| "grad_norm": 0.5675836801528931, |
| "learning_rate": 6.521739130434783e-06, |
| "loss": 3.4444, |
| "step": 21000 |
| }, |
| { |
| "epoch": 297.2526464361327, |
| "eval_loss": 3.366093397140503, |
| "eval_runtime": 75.0156, |
| "eval_samples_per_second": 133.306, |
| "eval_steps_per_second": 8.332, |
| "step": 21000 |
| }, |
| { |
| "epoch": 298.66407904022583, |
| "grad_norm": 0.6213370561599731, |
| "learning_rate": 6.5050167224080275e-06, |
| "loss": 3.4411, |
| "step": 21100 |
| }, |
| { |
| "epoch": 298.66407904022583, |
| "eval_loss": 3.362675666809082, |
| "eval_runtime": 75.5557, |
| "eval_samples_per_second": 132.353, |
| "eval_steps_per_second": 8.272, |
| "step": 21100 |
| }, |
| { |
| "epoch": 300.07551164431896, |
| "grad_norm": 0.5837761759757996, |
| "learning_rate": 6.488294314381271e-06, |
| "loss": 3.4401, |
| "step": 21200 |
| }, |
| { |
| "epoch": 300.07551164431896, |
| "eval_loss": 3.3615338802337646, |
| "eval_runtime": 76.9026, |
| "eval_samples_per_second": 130.035, |
| "eval_steps_per_second": 8.127, |
| "step": 21200 |
| }, |
| { |
| "epoch": 301.48694424841216, |
| "grad_norm": 0.512986958026886, |
| "learning_rate": 6.471571906354516e-06, |
| "loss": 3.4376, |
| "step": 21300 |
| }, |
| { |
| "epoch": 301.48694424841216, |
| "eval_loss": 3.360222578048706, |
| "eval_runtime": 77.0669, |
| "eval_samples_per_second": 129.757, |
| "eval_steps_per_second": 8.11, |
| "step": 21300 |
| }, |
| { |
| "epoch": 302.8983768525053, |
| "grad_norm": 0.6404605507850647, |
| "learning_rate": 6.45484949832776e-06, |
| "loss": 3.435, |
| "step": 21400 |
| }, |
| { |
| "epoch": 302.8983768525053, |
| "eval_loss": 3.355327844619751, |
| "eval_runtime": 77.0252, |
| "eval_samples_per_second": 129.828, |
| "eval_steps_per_second": 8.114, |
| "step": 21400 |
| }, |
| { |
| "epoch": 304.30980945659843, |
| "grad_norm": 0.6251723766326904, |
| "learning_rate": 6.438127090301004e-06, |
| "loss": 3.4317, |
| "step": 21500 |
| }, |
| { |
| "epoch": 304.30980945659843, |
| "eval_loss": 3.3560776710510254, |
| "eval_runtime": 76.6868, |
| "eval_samples_per_second": 130.4, |
| "eval_steps_per_second": 8.15, |
| "step": 21500 |
| }, |
| { |
| "epoch": 305.7212420606916, |
| "grad_norm": 0.6398562788963318, |
| "learning_rate": 6.421404682274248e-06, |
| "loss": 3.4313, |
| "step": 21600 |
| }, |
| { |
| "epoch": 305.7212420606916, |
| "eval_loss": 3.3519680500030518, |
| "eval_runtime": 76.6282, |
| "eval_samples_per_second": 130.5, |
| "eval_steps_per_second": 8.156, |
| "step": 21600 |
| }, |
| { |
| "epoch": 307.13267466478476, |
| "grad_norm": 0.6046691536903381, |
| "learning_rate": 6.4046822742474926e-06, |
| "loss": 3.4288, |
| "step": 21700 |
| }, |
| { |
| "epoch": 307.13267466478476, |
| "eval_loss": 3.3501694202423096, |
| "eval_runtime": 76.6941, |
| "eval_samples_per_second": 130.388, |
| "eval_steps_per_second": 8.149, |
| "step": 21700 |
| }, |
| { |
| "epoch": 308.5441072688779, |
| "grad_norm": 0.5842333436012268, |
| "learning_rate": 6.387959866220736e-06, |
| "loss": 3.4263, |
| "step": 21800 |
| }, |
| { |
| "epoch": 308.5441072688779, |
| "eval_loss": 3.3471367359161377, |
| "eval_runtime": 76.9426, |
| "eval_samples_per_second": 129.967, |
| "eval_steps_per_second": 8.123, |
| "step": 21800 |
| }, |
| { |
| "epoch": 309.9555398729711, |
| "grad_norm": 0.6153472065925598, |
| "learning_rate": 6.371237458193981e-06, |
| "loss": 3.4239, |
| "step": 21900 |
| }, |
| { |
| "epoch": 309.9555398729711, |
| "eval_loss": 3.3457906246185303, |
| "eval_runtime": 76.7426, |
| "eval_samples_per_second": 130.306, |
| "eval_steps_per_second": 8.144, |
| "step": 21900 |
| }, |
| { |
| "epoch": 311.3669724770642, |
| "grad_norm": 0.5566316246986389, |
| "learning_rate": 6.354515050167225e-06, |
| "loss": 3.4213, |
| "step": 22000 |
| }, |
| { |
| "epoch": 311.3669724770642, |
| "eval_loss": 3.343456268310547, |
| "eval_runtime": 76.5426, |
| "eval_samples_per_second": 130.646, |
| "eval_steps_per_second": 8.165, |
| "step": 22000 |
| }, |
| { |
| "epoch": 312.77840508115736, |
| "grad_norm": 0.5340844988822937, |
| "learning_rate": 6.337792642140469e-06, |
| "loss": 3.4212, |
| "step": 22100 |
| }, |
| { |
| "epoch": 312.77840508115736, |
| "eval_loss": 3.3435897827148438, |
| "eval_runtime": 76.6598, |
| "eval_samples_per_second": 130.446, |
| "eval_steps_per_second": 8.153, |
| "step": 22100 |
| }, |
| { |
| "epoch": 314.18983768525055, |
| "grad_norm": 0.5751451849937439, |
| "learning_rate": 6.321070234113713e-06, |
| "loss": 3.4176, |
| "step": 22200 |
| }, |
| { |
| "epoch": 314.18983768525055, |
| "eval_loss": 3.340423345565796, |
| "eval_runtime": 76.7241, |
| "eval_samples_per_second": 130.337, |
| "eval_steps_per_second": 8.146, |
| "step": 22200 |
| }, |
| { |
| "epoch": 315.6012702893437, |
| "grad_norm": 0.6988760828971863, |
| "learning_rate": 6.304347826086958e-06, |
| "loss": 3.4165, |
| "step": 22300 |
| }, |
| { |
| "epoch": 315.6012702893437, |
| "eval_loss": 3.3365683555603027, |
| "eval_runtime": 76.7082, |
| "eval_samples_per_second": 130.364, |
| "eval_steps_per_second": 8.148, |
| "step": 22300 |
| }, |
| { |
| "epoch": 317.0127028934368, |
| "grad_norm": 0.6003105044364929, |
| "learning_rate": 6.287625418060201e-06, |
| "loss": 3.4139, |
| "step": 22400 |
| }, |
| { |
| "epoch": 317.0127028934368, |
| "eval_loss": 3.3391008377075195, |
| "eval_runtime": 76.6667, |
| "eval_samples_per_second": 130.435, |
| "eval_steps_per_second": 8.152, |
| "step": 22400 |
| }, |
| { |
| "epoch": 318.42413549753, |
| "grad_norm": 0.6000874042510986, |
| "learning_rate": 6.270903010033445e-06, |
| "loss": 3.4116, |
| "step": 22500 |
| }, |
| { |
| "epoch": 318.42413549753, |
| "eval_loss": 3.3361566066741943, |
| "eval_runtime": 76.5668, |
| "eval_samples_per_second": 130.605, |
| "eval_steps_per_second": 8.163, |
| "step": 22500 |
| }, |
| { |
| "epoch": 319.83556810162315, |
| "grad_norm": 0.5550394058227539, |
| "learning_rate": 6.25418060200669e-06, |
| "loss": 3.4107, |
| "step": 22600 |
| }, |
| { |
| "epoch": 319.83556810162315, |
| "eval_loss": 3.334117889404297, |
| "eval_runtime": 76.601, |
| "eval_samples_per_second": 130.547, |
| "eval_steps_per_second": 8.159, |
| "step": 22600 |
| }, |
| { |
| "epoch": 321.2470007057163, |
| "grad_norm": 0.6073735356330872, |
| "learning_rate": 6.2374581939799335e-06, |
| "loss": 3.4081, |
| "step": 22700 |
| }, |
| { |
| "epoch": 321.2470007057163, |
| "eval_loss": 3.331463575363159, |
| "eval_runtime": 76.8708, |
| "eval_samples_per_second": 130.088, |
| "eval_steps_per_second": 8.131, |
| "step": 22700 |
| }, |
| { |
| "epoch": 322.6584333098095, |
| "grad_norm": 0.4972577691078186, |
| "learning_rate": 6.220735785953178e-06, |
| "loss": 3.4063, |
| "step": 22800 |
| }, |
| { |
| "epoch": 322.6584333098095, |
| "eval_loss": 3.3266327381134033, |
| "eval_runtime": 76.6986, |
| "eval_samples_per_second": 130.381, |
| "eval_steps_per_second": 8.149, |
| "step": 22800 |
| }, |
| { |
| "epoch": 324.0698659139026, |
| "grad_norm": 0.753437340259552, |
| "learning_rate": 6.204013377926422e-06, |
| "loss": 3.4039, |
| "step": 22900 |
| }, |
| { |
| "epoch": 324.0698659139026, |
| "eval_loss": 3.3264803886413574, |
| "eval_runtime": 76.8541, |
| "eval_samples_per_second": 130.117, |
| "eval_steps_per_second": 8.132, |
| "step": 22900 |
| }, |
| { |
| "epoch": 325.48129851799575, |
| "grad_norm": 0.6331711411476135, |
| "learning_rate": 6.1872909698996665e-06, |
| "loss": 3.4021, |
| "step": 23000 |
| }, |
| { |
| "epoch": 325.48129851799575, |
| "eval_loss": 3.328427314758301, |
| "eval_runtime": 76.5771, |
| "eval_samples_per_second": 130.587, |
| "eval_steps_per_second": 8.162, |
| "step": 23000 |
| }, |
| { |
| "epoch": 326.89273112208895, |
| "grad_norm": 0.5499814748764038, |
| "learning_rate": 6.17056856187291e-06, |
| "loss": 3.4007, |
| "step": 23100 |
| }, |
| { |
| "epoch": 326.89273112208895, |
| "eval_loss": 3.325409412384033, |
| "eval_runtime": 76.8356, |
| "eval_samples_per_second": 130.148, |
| "eval_steps_per_second": 8.134, |
| "step": 23100 |
| }, |
| { |
| "epoch": 328.3041637261821, |
| "grad_norm": 0.6611850261688232, |
| "learning_rate": 6.153846153846155e-06, |
| "loss": 3.3982, |
| "step": 23200 |
| }, |
| { |
| "epoch": 328.3041637261821, |
| "eval_loss": 3.325766086578369, |
| "eval_runtime": 76.7214, |
| "eval_samples_per_second": 130.342, |
| "eval_steps_per_second": 8.146, |
| "step": 23200 |
| }, |
| { |
| "epoch": 329.7155963302752, |
| "grad_norm": 0.7029439806938171, |
| "learning_rate": 6.137123745819399e-06, |
| "loss": 3.3966, |
| "step": 23300 |
| }, |
| { |
| "epoch": 329.7155963302752, |
| "eval_loss": 3.3255183696746826, |
| "eval_runtime": 76.7208, |
| "eval_samples_per_second": 130.343, |
| "eval_steps_per_second": 8.146, |
| "step": 23300 |
| }, |
| { |
| "epoch": 331.1270289343684, |
| "grad_norm": 0.5664273500442505, |
| "learning_rate": 6.120401337792643e-06, |
| "loss": 3.3942, |
| "step": 23400 |
| }, |
| { |
| "epoch": 331.1270289343684, |
| "eval_loss": 3.3201589584350586, |
| "eval_runtime": 76.9218, |
| "eval_samples_per_second": 130.002, |
| "eval_steps_per_second": 8.125, |
| "step": 23400 |
| }, |
| { |
| "epoch": 332.53846153846155, |
| "grad_norm": 0.49630409479141235, |
| "learning_rate": 6.103678929765887e-06, |
| "loss": 3.3915, |
| "step": 23500 |
| }, |
| { |
| "epoch": 332.53846153846155, |
| "eval_loss": 3.3179125785827637, |
| "eval_runtime": 76.7322, |
| "eval_samples_per_second": 130.323, |
| "eval_steps_per_second": 8.145, |
| "step": 23500 |
| }, |
| { |
| "epoch": 333.9498941425547, |
| "grad_norm": 0.5176893472671509, |
| "learning_rate": 6.086956521739132e-06, |
| "loss": 3.3909, |
| "step": 23600 |
| }, |
| { |
| "epoch": 333.9498941425547, |
| "eval_loss": 3.316556215286255, |
| "eval_runtime": 76.9107, |
| "eval_samples_per_second": 130.021, |
| "eval_steps_per_second": 8.126, |
| "step": 23600 |
| }, |
| { |
| "epoch": 335.3613267466478, |
| "grad_norm": 0.5305048823356628, |
| "learning_rate": 6.070234113712375e-06, |
| "loss": 3.3892, |
| "step": 23700 |
| }, |
| { |
| "epoch": 335.3613267466478, |
| "eval_loss": 3.314192533493042, |
| "eval_runtime": 76.9191, |
| "eval_samples_per_second": 130.007, |
| "eval_steps_per_second": 8.125, |
| "step": 23700 |
| }, |
| { |
| "epoch": 336.772759350741, |
| "grad_norm": 0.5576648116111755, |
| "learning_rate": 6.05351170568562e-06, |
| "loss": 3.3875, |
| "step": 23800 |
| }, |
| { |
| "epoch": 336.772759350741, |
| "eval_loss": 3.314286708831787, |
| "eval_runtime": 80.2593, |
| "eval_samples_per_second": 124.596, |
| "eval_steps_per_second": 7.787, |
| "step": 23800 |
| }, |
| { |
| "epoch": 338.18419195483415, |
| "grad_norm": 0.5328962802886963, |
| "learning_rate": 6.036789297658864e-06, |
| "loss": 3.3851, |
| "step": 23900 |
| }, |
| { |
| "epoch": 338.18419195483415, |
| "eval_loss": 3.312880516052246, |
| "eval_runtime": 76.9752, |
| "eval_samples_per_second": 129.912, |
| "eval_steps_per_second": 8.119, |
| "step": 23900 |
| }, |
| { |
| "epoch": 339.5956245589273, |
| "grad_norm": 0.568818211555481, |
| "learning_rate": 6.020066889632108e-06, |
| "loss": 3.3839, |
| "step": 24000 |
| }, |
| { |
| "epoch": 339.5956245589273, |
| "eval_loss": 3.3099491596221924, |
| "eval_runtime": 76.771, |
| "eval_samples_per_second": 130.258, |
| "eval_steps_per_second": 8.141, |
| "step": 24000 |
| }, |
| { |
| "epoch": 341.0070571630205, |
| "grad_norm": 0.7009373307228088, |
| "learning_rate": 6.003344481605352e-06, |
| "loss": 3.3816, |
| "step": 24100 |
| }, |
| { |
| "epoch": 341.0070571630205, |
| "eval_loss": 3.3101296424865723, |
| "eval_runtime": 76.9806, |
| "eval_samples_per_second": 129.903, |
| "eval_steps_per_second": 8.119, |
| "step": 24100 |
| }, |
| { |
| "epoch": 342.4184897671136, |
| "grad_norm": 0.677617073059082, |
| "learning_rate": 5.986622073578597e-06, |
| "loss": 3.3797, |
| "step": 24200 |
| }, |
| { |
| "epoch": 342.4184897671136, |
| "eval_loss": 3.3081202507019043, |
| "eval_runtime": 77.0716, |
| "eval_samples_per_second": 129.75, |
| "eval_steps_per_second": 8.109, |
| "step": 24200 |
| }, |
| { |
| "epoch": 343.82992237120675, |
| "grad_norm": 0.5626755952835083, |
| "learning_rate": 5.9698996655518404e-06, |
| "loss": 3.3783, |
| "step": 24300 |
| }, |
| { |
| "epoch": 343.82992237120675, |
| "eval_loss": 3.306441307067871, |
| "eval_runtime": 77.1155, |
| "eval_samples_per_second": 129.676, |
| "eval_steps_per_second": 8.105, |
| "step": 24300 |
| }, |
| { |
| "epoch": 345.24135497529994, |
| "grad_norm": 0.5604245662689209, |
| "learning_rate": 5.953177257525084e-06, |
| "loss": 3.3767, |
| "step": 24400 |
| }, |
| { |
| "epoch": 345.24135497529994, |
| "eval_loss": 3.3060827255249023, |
| "eval_runtime": 77.0349, |
| "eval_samples_per_second": 129.811, |
| "eval_steps_per_second": 8.113, |
| "step": 24400 |
| }, |
| { |
| "epoch": 346.6527875793931, |
| "grad_norm": 0.4447239339351654, |
| "learning_rate": 5.936454849498329e-06, |
| "loss": 3.3753, |
| "step": 24500 |
| }, |
| { |
| "epoch": 346.6527875793931, |
| "eval_loss": 3.299449920654297, |
| "eval_runtime": 77.0415, |
| "eval_samples_per_second": 129.8, |
| "eval_steps_per_second": 8.113, |
| "step": 24500 |
| }, |
| { |
| "epoch": 348.0642201834862, |
| "grad_norm": 0.4330262839794159, |
| "learning_rate": 5.9197324414715726e-06, |
| "loss": 3.3728, |
| "step": 24600 |
| }, |
| { |
| "epoch": 348.0642201834862, |
| "eval_loss": 3.3012359142303467, |
| "eval_runtime": 76.9455, |
| "eval_samples_per_second": 129.962, |
| "eval_steps_per_second": 8.123, |
| "step": 24600 |
| }, |
| { |
| "epoch": 349.4756527875794, |
| "grad_norm": 0.5388196110725403, |
| "learning_rate": 5.903010033444817e-06, |
| "loss": 3.3712, |
| "step": 24700 |
| }, |
| { |
| "epoch": 349.4756527875794, |
| "eval_loss": 3.3015191555023193, |
| "eval_runtime": 77.0484, |
| "eval_samples_per_second": 129.789, |
| "eval_steps_per_second": 8.112, |
| "step": 24700 |
| }, |
| { |
| "epoch": 350.88708539167254, |
| "grad_norm": 0.5516813397407532, |
| "learning_rate": 5.886287625418061e-06, |
| "loss": 3.3687, |
| "step": 24800 |
| }, |
| { |
| "epoch": 350.88708539167254, |
| "eval_loss": 3.299619436264038, |
| "eval_runtime": 77.2423, |
| "eval_samples_per_second": 129.463, |
| "eval_steps_per_second": 8.091, |
| "step": 24800 |
| }, |
| { |
| "epoch": 352.2985179957657, |
| "grad_norm": 0.5282019972801208, |
| "learning_rate": 5.8695652173913055e-06, |
| "loss": 3.3675, |
| "step": 24900 |
| }, |
| { |
| "epoch": 352.2985179957657, |
| "eval_loss": 3.2979164123535156, |
| "eval_runtime": 77.1133, |
| "eval_samples_per_second": 129.679, |
| "eval_steps_per_second": 8.105, |
| "step": 24900 |
| }, |
| { |
| "epoch": 353.70995059985887, |
| "grad_norm": 0.5380076169967651, |
| "learning_rate": 5.852842809364549e-06, |
| "loss": 3.366, |
| "step": 25000 |
| }, |
| { |
| "epoch": 353.70995059985887, |
| "eval_loss": 3.2956883907318115, |
| "eval_runtime": 77.1138, |
| "eval_samples_per_second": 129.679, |
| "eval_steps_per_second": 8.105, |
| "step": 25000 |
| }, |
| { |
| "epoch": 355.121383203952, |
| "grad_norm": 0.5667893290519714, |
| "learning_rate": 5.836120401337794e-06, |
| "loss": 3.3642, |
| "step": 25100 |
| }, |
| { |
| "epoch": 355.121383203952, |
| "eval_loss": 3.292149543762207, |
| "eval_runtime": 76.8472, |
| "eval_samples_per_second": 130.128, |
| "eval_steps_per_second": 8.133, |
| "step": 25100 |
| }, |
| { |
| "epoch": 356.53281580804514, |
| "grad_norm": 0.5600453019142151, |
| "learning_rate": 5.819397993311037e-06, |
| "loss": 3.363, |
| "step": 25200 |
| }, |
| { |
| "epoch": 356.53281580804514, |
| "eval_loss": 3.2910449504852295, |
| "eval_runtime": 77.1271, |
| "eval_samples_per_second": 129.656, |
| "eval_steps_per_second": 8.104, |
| "step": 25200 |
| }, |
| { |
| "epoch": 357.94424841213834, |
| "grad_norm": 0.5726943016052246, |
| "learning_rate": 5.8026755852842806e-06, |
| "loss": 3.3617, |
| "step": 25300 |
| }, |
| { |
| "epoch": 357.94424841213834, |
| "eval_loss": 3.292724609375, |
| "eval_runtime": 77.1078, |
| "eval_samples_per_second": 129.689, |
| "eval_steps_per_second": 8.106, |
| "step": 25300 |
| }, |
| { |
| "epoch": 359.3556810162315, |
| "grad_norm": 0.5794702768325806, |
| "learning_rate": 5.785953177257525e-06, |
| "loss": 3.3605, |
| "step": 25400 |
| }, |
| { |
| "epoch": 359.3556810162315, |
| "eval_loss": 3.2890124320983887, |
| "eval_runtime": 75.9883, |
| "eval_samples_per_second": 131.599, |
| "eval_steps_per_second": 8.225, |
| "step": 25400 |
| }, |
| { |
| "epoch": 360.7671136203246, |
| "grad_norm": 0.5813022255897522, |
| "learning_rate": 5.769230769230769e-06, |
| "loss": 3.3579, |
| "step": 25500 |
| }, |
| { |
| "epoch": 360.7671136203246, |
| "eval_loss": 3.288177251815796, |
| "eval_runtime": 76.09, |
| "eval_samples_per_second": 131.423, |
| "eval_steps_per_second": 8.214, |
| "step": 25500 |
| }, |
| { |
| "epoch": 362.1785462244178, |
| "grad_norm": 0.5103623270988464, |
| "learning_rate": 5.7525083612040135e-06, |
| "loss": 3.3559, |
| "step": 25600 |
| }, |
| { |
| "epoch": 362.1785462244178, |
| "eval_loss": 3.2898659706115723, |
| "eval_runtime": 75.9717, |
| "eval_samples_per_second": 131.628, |
| "eval_steps_per_second": 8.227, |
| "step": 25600 |
| }, |
| { |
| "epoch": 363.58997882851094, |
| "grad_norm": 0.5904416441917419, |
| "learning_rate": 5.735785953177257e-06, |
| "loss": 3.3546, |
| "step": 25700 |
| }, |
| { |
| "epoch": 363.58997882851094, |
| "eval_loss": 3.2834417819976807, |
| "eval_runtime": 76.1148, |
| "eval_samples_per_second": 131.38, |
| "eval_steps_per_second": 8.211, |
| "step": 25700 |
| }, |
| { |
| "epoch": 365.0014114326041, |
| "grad_norm": 0.5598199367523193, |
| "learning_rate": 5.719063545150502e-06, |
| "loss": 3.3532, |
| "step": 25800 |
| }, |
| { |
| "epoch": 365.0014114326041, |
| "eval_loss": 3.2836413383483887, |
| "eval_runtime": 76.0815, |
| "eval_samples_per_second": 131.438, |
| "eval_steps_per_second": 8.215, |
| "step": 25800 |
| }, |
| { |
| "epoch": 366.41284403669727, |
| "grad_norm": 0.4987429976463318, |
| "learning_rate": 5.702341137123746e-06, |
| "loss": 3.3506, |
| "step": 25900 |
| }, |
| { |
| "epoch": 366.41284403669727, |
| "eval_loss": 3.2850100994110107, |
| "eval_runtime": 76.1264, |
| "eval_samples_per_second": 131.361, |
| "eval_steps_per_second": 8.21, |
| "step": 25900 |
| }, |
| { |
| "epoch": 367.8242766407904, |
| "grad_norm": 0.5134937763214111, |
| "learning_rate": 5.68561872909699e-06, |
| "loss": 3.3502, |
| "step": 26000 |
| }, |
| { |
| "epoch": 367.8242766407904, |
| "eval_loss": 3.2814862728118896, |
| "eval_runtime": 74.6669, |
| "eval_samples_per_second": 133.928, |
| "eval_steps_per_second": 8.371, |
| "step": 26000 |
| }, |
| { |
| "epoch": 369.23570924488354, |
| "grad_norm": 0.44258707761764526, |
| "learning_rate": 5.668896321070234e-06, |
| "loss": 3.3486, |
| "step": 26100 |
| }, |
| { |
| "epoch": 369.23570924488354, |
| "eval_loss": 3.277559757232666, |
| "eval_runtime": 74.7207, |
| "eval_samples_per_second": 133.832, |
| "eval_steps_per_second": 8.364, |
| "step": 26100 |
| }, |
| { |
| "epoch": 370.64714184897673, |
| "grad_norm": 0.5384514927864075, |
| "learning_rate": 5.652173913043479e-06, |
| "loss": 3.3449, |
| "step": 26200 |
| }, |
| { |
| "epoch": 370.64714184897673, |
| "eval_loss": 3.2763917446136475, |
| "eval_runtime": 74.5448, |
| "eval_samples_per_second": 134.148, |
| "eval_steps_per_second": 8.384, |
| "step": 26200 |
| }, |
| { |
| "epoch": 372.05857445306987, |
| "grad_norm": 0.6275975108146667, |
| "learning_rate": 5.635451505016722e-06, |
| "loss": 3.3452, |
| "step": 26300 |
| }, |
| { |
| "epoch": 372.05857445306987, |
| "eval_loss": 3.273946523666382, |
| "eval_runtime": 74.5431, |
| "eval_samples_per_second": 134.151, |
| "eval_steps_per_second": 8.384, |
| "step": 26300 |
| }, |
| { |
| "epoch": 373.470007057163, |
| "grad_norm": 0.44813865423202515, |
| "learning_rate": 5.618729096989967e-06, |
| "loss": 3.3436, |
| "step": 26400 |
| }, |
| { |
| "epoch": 373.470007057163, |
| "eval_loss": 3.272003412246704, |
| "eval_runtime": 74.6769, |
| "eval_samples_per_second": 133.91, |
| "eval_steps_per_second": 8.369, |
| "step": 26400 |
| }, |
| { |
| "epoch": 374.8814396612562, |
| "grad_norm": 0.5522853136062622, |
| "learning_rate": 5.602006688963211e-06, |
| "loss": 3.3418, |
| "step": 26500 |
| }, |
| { |
| "epoch": 374.8814396612562, |
| "eval_loss": 3.271955966949463, |
| "eval_runtime": 74.7546, |
| "eval_samples_per_second": 133.771, |
| "eval_steps_per_second": 8.361, |
| "step": 26500 |
| }, |
| { |
| "epoch": 376.29287226534933, |
| "grad_norm": 0.6033642888069153, |
| "learning_rate": 5.585284280936455e-06, |
| "loss": 3.3406, |
| "step": 26600 |
| }, |
| { |
| "epoch": 376.29287226534933, |
| "eval_loss": 3.2729640007019043, |
| "eval_runtime": 74.7615, |
| "eval_samples_per_second": 133.759, |
| "eval_steps_per_second": 8.36, |
| "step": 26600 |
| }, |
| { |
| "epoch": 377.70430486944247, |
| "grad_norm": 0.39052197337150574, |
| "learning_rate": 5.568561872909699e-06, |
| "loss": 3.3389, |
| "step": 26700 |
| }, |
| { |
| "epoch": 377.70430486944247, |
| "eval_loss": 3.2696945667266846, |
| "eval_runtime": 74.854, |
| "eval_samples_per_second": 133.593, |
| "eval_steps_per_second": 8.35, |
| "step": 26700 |
| }, |
| { |
| "epoch": 379.11573747353566, |
| "grad_norm": 0.6026007533073425, |
| "learning_rate": 5.551839464882943e-06, |
| "loss": 3.3365, |
| "step": 26800 |
| }, |
| { |
| "epoch": 379.11573747353566, |
| "eval_loss": 3.2685930728912354, |
| "eval_runtime": 74.7049, |
| "eval_samples_per_second": 133.86, |
| "eval_steps_per_second": 8.366, |
| "step": 26800 |
| }, |
| { |
| "epoch": 380.5271700776288, |
| "grad_norm": 0.47670257091522217, |
| "learning_rate": 5.5351170568561875e-06, |
| "loss": 3.3368, |
| "step": 26900 |
| }, |
| { |
| "epoch": 380.5271700776288, |
| "eval_loss": 3.2685508728027344, |
| "eval_runtime": 74.7567, |
| "eval_samples_per_second": 133.767, |
| "eval_steps_per_second": 8.36, |
| "step": 26900 |
| }, |
| { |
| "epoch": 381.93860268172193, |
| "grad_norm": 0.5947960615158081, |
| "learning_rate": 5.518394648829431e-06, |
| "loss": 3.3337, |
| "step": 27000 |
| }, |
| { |
| "epoch": 381.93860268172193, |
| "eval_loss": 3.2665364742279053, |
| "eval_runtime": 75.0954, |
| "eval_samples_per_second": 133.164, |
| "eval_steps_per_second": 8.323, |
| "step": 27000 |
| }, |
| { |
| "epoch": 383.3500352858151, |
| "grad_norm": 0.5476217865943909, |
| "learning_rate": 5.501672240802676e-06, |
| "loss": 3.3329, |
| "step": 27100 |
| }, |
| { |
| "epoch": 383.3500352858151, |
| "eval_loss": 3.2639975547790527, |
| "eval_runtime": 75.1039, |
| "eval_samples_per_second": 133.149, |
| "eval_steps_per_second": 8.322, |
| "step": 27100 |
| }, |
| { |
| "epoch": 384.76146788990826, |
| "grad_norm": 0.4159116744995117, |
| "learning_rate": 5.48494983277592e-06, |
| "loss": 3.3315, |
| "step": 27200 |
| }, |
| { |
| "epoch": 384.76146788990826, |
| "eval_loss": 3.263084650039673, |
| "eval_runtime": 74.9436, |
| "eval_samples_per_second": 133.434, |
| "eval_steps_per_second": 8.34, |
| "step": 27200 |
| }, |
| { |
| "epoch": 386.1729004940014, |
| "grad_norm": 0.5419652462005615, |
| "learning_rate": 5.468227424749164e-06, |
| "loss": 3.3303, |
| "step": 27300 |
| }, |
| { |
| "epoch": 386.1729004940014, |
| "eval_loss": 3.2612051963806152, |
| "eval_runtime": 75.1117, |
| "eval_samples_per_second": 133.135, |
| "eval_steps_per_second": 8.321, |
| "step": 27300 |
| }, |
| { |
| "epoch": 387.5843330980946, |
| "grad_norm": 0.512188732624054, |
| "learning_rate": 5.451505016722408e-06, |
| "loss": 3.3292, |
| "step": 27400 |
| }, |
| { |
| "epoch": 387.5843330980946, |
| "eval_loss": 3.259936809539795, |
| "eval_runtime": 75.0941, |
| "eval_samples_per_second": 133.166, |
| "eval_steps_per_second": 8.323, |
| "step": 27400 |
| }, |
| { |
| "epoch": 388.9957657021877, |
| "grad_norm": 0.727654755115509, |
| "learning_rate": 5.4347826086956525e-06, |
| "loss": 3.3269, |
| "step": 27500 |
| }, |
| { |
| "epoch": 388.9957657021877, |
| "eval_loss": 3.261517286300659, |
| "eval_runtime": 75.0628, |
| "eval_samples_per_second": 133.222, |
| "eval_steps_per_second": 8.326, |
| "step": 27500 |
| }, |
| { |
| "epoch": 390.40719830628086, |
| "grad_norm": 0.5372282862663269, |
| "learning_rate": 5.418060200668896e-06, |
| "loss": 3.3254, |
| "step": 27600 |
| }, |
| { |
| "epoch": 390.40719830628086, |
| "eval_loss": 3.258228302001953, |
| "eval_runtime": 74.9449, |
| "eval_samples_per_second": 133.431, |
| "eval_steps_per_second": 8.339, |
| "step": 27600 |
| }, |
| { |
| "epoch": 391.81863091037405, |
| "grad_norm": 0.5655878186225891, |
| "learning_rate": 5.401337792642141e-06, |
| "loss": 3.3242, |
| "step": 27700 |
| }, |
| { |
| "epoch": 391.81863091037405, |
| "eval_loss": 3.2560343742370605, |
| "eval_runtime": 75.2757, |
| "eval_samples_per_second": 132.845, |
| "eval_steps_per_second": 8.303, |
| "step": 27700 |
| }, |
| { |
| "epoch": 393.2300635144672, |
| "grad_norm": 0.6290187835693359, |
| "learning_rate": 5.384615384615385e-06, |
| "loss": 3.3238, |
| "step": 27800 |
| }, |
| { |
| "epoch": 393.2300635144672, |
| "eval_loss": 3.2537336349487305, |
| "eval_runtime": 75.0543, |
| "eval_samples_per_second": 133.237, |
| "eval_steps_per_second": 8.327, |
| "step": 27800 |
| }, |
| { |
| "epoch": 394.6414961185603, |
| "grad_norm": 0.6421985626220703, |
| "learning_rate": 5.367892976588629e-06, |
| "loss": 3.3218, |
| "step": 27900 |
| }, |
| { |
| "epoch": 394.6414961185603, |
| "eval_loss": 3.254143238067627, |
| "eval_runtime": 75.0661, |
| "eval_samples_per_second": 133.216, |
| "eval_steps_per_second": 8.326, |
| "step": 27900 |
| }, |
| { |
| "epoch": 396.0529287226535, |
| "grad_norm": 0.4675317108631134, |
| "learning_rate": 5.351170568561873e-06, |
| "loss": 3.3203, |
| "step": 28000 |
| }, |
| { |
| "epoch": 396.0529287226535, |
| "eval_loss": 3.2528676986694336, |
| "eval_runtime": 74.9957, |
| "eval_samples_per_second": 133.341, |
| "eval_steps_per_second": 8.334, |
| "step": 28000 |
| }, |
| { |
| "epoch": 397.46436132674665, |
| "grad_norm": 0.5349506735801697, |
| "learning_rate": 5.334448160535118e-06, |
| "loss": 3.3185, |
| "step": 28100 |
| }, |
| { |
| "epoch": 397.46436132674665, |
| "eval_loss": 3.2534239292144775, |
| "eval_runtime": 74.7782, |
| "eval_samples_per_second": 133.729, |
| "eval_steps_per_second": 8.358, |
| "step": 28100 |
| }, |
| { |
| "epoch": 398.8757939308398, |
| "grad_norm": 0.6712241768836975, |
| "learning_rate": 5.317725752508361e-06, |
| "loss": 3.3173, |
| "step": 28200 |
| }, |
| { |
| "epoch": 398.8757939308398, |
| "eval_loss": 3.2485058307647705, |
| "eval_runtime": 74.9968, |
| "eval_samples_per_second": 133.339, |
| "eval_steps_per_second": 8.334, |
| "step": 28200 |
| }, |
| { |
| "epoch": 400.287226534933, |
| "grad_norm": 0.4701189696788788, |
| "learning_rate": 5.301003344481606e-06, |
| "loss": 3.3163, |
| "step": 28300 |
| }, |
| { |
| "epoch": 400.287226534933, |
| "eval_loss": 3.2490499019622803, |
| "eval_runtime": 75.0541, |
| "eval_samples_per_second": 133.237, |
| "eval_steps_per_second": 8.327, |
| "step": 28300 |
| }, |
| { |
| "epoch": 401.6986591390261, |
| "grad_norm": 0.5290805101394653, |
| "learning_rate": 5.28428093645485e-06, |
| "loss": 3.315, |
| "step": 28400 |
| }, |
| { |
| "epoch": 401.6986591390261, |
| "eval_loss": 3.2493200302124023, |
| "eval_runtime": 75.1066, |
| "eval_samples_per_second": 133.144, |
| "eval_steps_per_second": 8.322, |
| "step": 28400 |
| }, |
| { |
| "epoch": 403.11009174311926, |
| "grad_norm": 0.5082231760025024, |
| "learning_rate": 5.2675585284280935e-06, |
| "loss": 3.313, |
| "step": 28500 |
| }, |
| { |
| "epoch": 403.11009174311926, |
| "eval_loss": 3.2446095943450928, |
| "eval_runtime": 75.0521, |
| "eval_samples_per_second": 133.241, |
| "eval_steps_per_second": 8.328, |
| "step": 28500 |
| }, |
| { |
| "epoch": 404.52152434721245, |
| "grad_norm": 0.6178306937217712, |
| "learning_rate": 5.250836120401338e-06, |
| "loss": 3.3124, |
| "step": 28600 |
| }, |
| { |
| "epoch": 404.52152434721245, |
| "eval_loss": 3.243537425994873, |
| "eval_runtime": 74.8287, |
| "eval_samples_per_second": 133.638, |
| "eval_steps_per_second": 8.352, |
| "step": 28600 |
| }, |
| { |
| "epoch": 405.9329569513056, |
| "grad_norm": 0.48364028334617615, |
| "learning_rate": 5.234113712374582e-06, |
| "loss": 3.3104, |
| "step": 28700 |
| }, |
| { |
| "epoch": 405.9329569513056, |
| "eval_loss": 3.243783712387085, |
| "eval_runtime": 74.8558, |
| "eval_samples_per_second": 133.59, |
| "eval_steps_per_second": 8.349, |
| "step": 28700 |
| }, |
| { |
| "epoch": 407.3443895553987, |
| "grad_norm": 0.6693919897079468, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 3.3088, |
| "step": 28800 |
| }, |
| { |
| "epoch": 407.3443895553987, |
| "eval_loss": 3.2437820434570312, |
| "eval_runtime": 75.0675, |
| "eval_samples_per_second": 133.213, |
| "eval_steps_per_second": 8.326, |
| "step": 28800 |
| }, |
| { |
| "epoch": 408.75582215949186, |
| "grad_norm": 0.5126225352287292, |
| "learning_rate": 5.20066889632107e-06, |
| "loss": 3.3087, |
| "step": 28900 |
| }, |
| { |
| "epoch": 408.75582215949186, |
| "eval_loss": 3.2396769523620605, |
| "eval_runtime": 74.9695, |
| "eval_samples_per_second": 133.388, |
| "eval_steps_per_second": 8.337, |
| "step": 28900 |
| }, |
| { |
| "epoch": 410.16725476358505, |
| "grad_norm": 0.48420536518096924, |
| "learning_rate": 5.183946488294315e-06, |
| "loss": 3.3072, |
| "step": 29000 |
| }, |
| { |
| "epoch": 410.16725476358505, |
| "eval_loss": 3.2414257526397705, |
| "eval_runtime": 75.1143, |
| "eval_samples_per_second": 133.13, |
| "eval_steps_per_second": 8.321, |
| "step": 29000 |
| }, |
| { |
| "epoch": 411.5786873676782, |
| "grad_norm": 0.4449322819709778, |
| "learning_rate": 5.167224080267559e-06, |
| "loss": 3.3052, |
| "step": 29100 |
| }, |
| { |
| "epoch": 411.5786873676782, |
| "eval_loss": 3.2392234802246094, |
| "eval_runtime": 75.0722, |
| "eval_samples_per_second": 133.205, |
| "eval_steps_per_second": 8.325, |
| "step": 29100 |
| }, |
| { |
| "epoch": 412.9901199717713, |
| "grad_norm": 0.5432822704315186, |
| "learning_rate": 5.150501672240803e-06, |
| "loss": 3.3039, |
| "step": 29200 |
| }, |
| { |
| "epoch": 412.9901199717713, |
| "eval_loss": 3.236405372619629, |
| "eval_runtime": 74.8808, |
| "eval_samples_per_second": 133.546, |
| "eval_steps_per_second": 8.347, |
| "step": 29200 |
| }, |
| { |
| "epoch": 414.4015525758645, |
| "grad_norm": 0.5220522880554199, |
| "learning_rate": 5.133779264214047e-06, |
| "loss": 3.3038, |
| "step": 29300 |
| }, |
| { |
| "epoch": 414.4015525758645, |
| "eval_loss": 3.237210273742676, |
| "eval_runtime": 75.1138, |
| "eval_samples_per_second": 133.131, |
| "eval_steps_per_second": 8.321, |
| "step": 29300 |
| }, |
| { |
| "epoch": 415.81298517995765, |
| "grad_norm": 0.40687912702560425, |
| "learning_rate": 5.1170568561872916e-06, |
| "loss": 3.3017, |
| "step": 29400 |
| }, |
| { |
| "epoch": 415.81298517995765, |
| "eval_loss": 3.2353012561798096, |
| "eval_runtime": 75.0181, |
| "eval_samples_per_second": 133.301, |
| "eval_steps_per_second": 8.331, |
| "step": 29400 |
| }, |
| { |
| "epoch": 417.2244177840508, |
| "grad_norm": 0.5893601775169373, |
| "learning_rate": 5.100334448160535e-06, |
| "loss": 3.3005, |
| "step": 29500 |
| }, |
| { |
| "epoch": 417.2244177840508, |
| "eval_loss": 3.2351584434509277, |
| "eval_runtime": 75.0403, |
| "eval_samples_per_second": 133.262, |
| "eval_steps_per_second": 8.329, |
| "step": 29500 |
| }, |
| { |
| "epoch": 418.635850388144, |
| "grad_norm": 0.5068885087966919, |
| "learning_rate": 5.08361204013378e-06, |
| "loss": 3.2986, |
| "step": 29600 |
| }, |
| { |
| "epoch": 418.635850388144, |
| "eval_loss": 3.234811782836914, |
| "eval_runtime": 75.0157, |
| "eval_samples_per_second": 133.306, |
| "eval_steps_per_second": 8.332, |
| "step": 29600 |
| }, |
| { |
| "epoch": 420.0472829922371, |
| "grad_norm": 0.5675772428512573, |
| "learning_rate": 5.066889632107024e-06, |
| "loss": 3.2984, |
| "step": 29700 |
| }, |
| { |
| "epoch": 420.0472829922371, |
| "eval_loss": 3.231552839279175, |
| "eval_runtime": 74.8696, |
| "eval_samples_per_second": 133.566, |
| "eval_steps_per_second": 8.348, |
| "step": 29700 |
| }, |
| { |
| "epoch": 421.45871559633025, |
| "grad_norm": 0.5616305470466614, |
| "learning_rate": 5.050167224080268e-06, |
| "loss": 3.2971, |
| "step": 29800 |
| }, |
| { |
| "epoch": 421.45871559633025, |
| "eval_loss": 3.2294559478759766, |
| "eval_runtime": 74.9526, |
| "eval_samples_per_second": 133.418, |
| "eval_steps_per_second": 8.339, |
| "step": 29800 |
| }, |
| { |
| "epoch": 422.87014820042344, |
| "grad_norm": 0.4248732328414917, |
| "learning_rate": 5.033444816053512e-06, |
| "loss": 3.2951, |
| "step": 29900 |
| }, |
| { |
| "epoch": 422.87014820042344, |
| "eval_loss": 3.2306745052337646, |
| "eval_runtime": 74.9879, |
| "eval_samples_per_second": 133.355, |
| "eval_steps_per_second": 8.335, |
| "step": 29900 |
| }, |
| { |
| "epoch": 424.2815808045166, |
| "grad_norm": 0.4150717556476593, |
| "learning_rate": 5.016722408026757e-06, |
| "loss": 3.2934, |
| "step": 30000 |
| }, |
| { |
| "epoch": 424.2815808045166, |
| "eval_loss": 3.2273478507995605, |
| "eval_runtime": 75.0475, |
| "eval_samples_per_second": 133.249, |
| "eval_steps_per_second": 8.328, |
| "step": 30000 |
| }, |
| { |
| "epoch": 425.6930134086097, |
| "grad_norm": 0.43221816420555115, |
| "learning_rate": 5e-06, |
| "loss": 3.293, |
| "step": 30100 |
| }, |
| { |
| "epoch": 425.6930134086097, |
| "eval_loss": 3.2289834022521973, |
| "eval_runtime": 75.3497, |
| "eval_samples_per_second": 132.715, |
| "eval_steps_per_second": 8.295, |
| "step": 30100 |
| }, |
| { |
| "epoch": 427.1044460127029, |
| "grad_norm": 0.5517231822013855, |
| "learning_rate": 4.983277591973244e-06, |
| "loss": 3.2926, |
| "step": 30200 |
| }, |
| { |
| "epoch": 427.1044460127029, |
| "eval_loss": 3.2246079444885254, |
| "eval_runtime": 75.5521, |
| "eval_samples_per_second": 132.359, |
| "eval_steps_per_second": 8.272, |
| "step": 30200 |
| }, |
| { |
| "epoch": 428.51587861679604, |
| "grad_norm": 0.47511130571365356, |
| "learning_rate": 4.966555183946489e-06, |
| "loss": 3.2895, |
| "step": 30300 |
| }, |
| { |
| "epoch": 428.51587861679604, |
| "eval_loss": 3.2235753536224365, |
| "eval_runtime": 75.5117, |
| "eval_samples_per_second": 132.43, |
| "eval_steps_per_second": 8.277, |
| "step": 30300 |
| }, |
| { |
| "epoch": 429.9273112208892, |
| "grad_norm": 0.4866757392883301, |
| "learning_rate": 4.9498327759197325e-06, |
| "loss": 3.2895, |
| "step": 30400 |
| }, |
| { |
| "epoch": 429.9273112208892, |
| "eval_loss": 3.2261710166931152, |
| "eval_runtime": 75.5689, |
| "eval_samples_per_second": 132.33, |
| "eval_steps_per_second": 8.271, |
| "step": 30400 |
| }, |
| { |
| "epoch": 431.3387438249824, |
| "grad_norm": 0.4943363666534424, |
| "learning_rate": 4.933110367892977e-06, |
| "loss": 3.2883, |
| "step": 30500 |
| }, |
| { |
| "epoch": 431.3387438249824, |
| "eval_loss": 3.2263576984405518, |
| "eval_runtime": 75.3634, |
| "eval_samples_per_second": 132.69, |
| "eval_steps_per_second": 8.293, |
| "step": 30500 |
| }, |
| { |
| "epoch": 432.7501764290755, |
| "grad_norm": 0.5846276879310608, |
| "learning_rate": 4.916387959866221e-06, |
| "loss": 3.2873, |
| "step": 30600 |
| }, |
| { |
| "epoch": 432.7501764290755, |
| "eval_loss": 3.220747709274292, |
| "eval_runtime": 75.4241, |
| "eval_samples_per_second": 132.584, |
| "eval_steps_per_second": 8.286, |
| "step": 30600 |
| }, |
| { |
| "epoch": 434.16160903316865, |
| "grad_norm": 0.4371052086353302, |
| "learning_rate": 4.8996655518394655e-06, |
| "loss": 3.2857, |
| "step": 30700 |
| }, |
| { |
| "epoch": 434.16160903316865, |
| "eval_loss": 3.2217774391174316, |
| "eval_runtime": 75.5005, |
| "eval_samples_per_second": 132.45, |
| "eval_steps_per_second": 8.278, |
| "step": 30700 |
| }, |
| { |
| "epoch": 435.57304163726184, |
| "grad_norm": 0.5578730702400208, |
| "learning_rate": 4.882943143812709e-06, |
| "loss": 3.2844, |
| "step": 30800 |
| }, |
| { |
| "epoch": 435.57304163726184, |
| "eval_loss": 3.219982862472534, |
| "eval_runtime": 75.31, |
| "eval_samples_per_second": 132.784, |
| "eval_steps_per_second": 8.299, |
| "step": 30800 |
| }, |
| { |
| "epoch": 436.984474241355, |
| "grad_norm": 0.4692535996437073, |
| "learning_rate": 4.866220735785954e-06, |
| "loss": 3.2839, |
| "step": 30900 |
| }, |
| { |
| "epoch": 436.984474241355, |
| "eval_loss": 3.2172024250030518, |
| "eval_runtime": 75.315, |
| "eval_samples_per_second": 132.776, |
| "eval_steps_per_second": 8.298, |
| "step": 30900 |
| }, |
| { |
| "epoch": 438.3959068454481, |
| "grad_norm": 0.5210705995559692, |
| "learning_rate": 4.849498327759198e-06, |
| "loss": 3.2817, |
| "step": 31000 |
| }, |
| { |
| "epoch": 438.3959068454481, |
| "eval_loss": 3.220083236694336, |
| "eval_runtime": 74.9928, |
| "eval_samples_per_second": 133.346, |
| "eval_steps_per_second": 8.334, |
| "step": 31000 |
| }, |
| { |
| "epoch": 439.8073394495413, |
| "grad_norm": 0.417936235666275, |
| "learning_rate": 4.832775919732442e-06, |
| "loss": 3.2814, |
| "step": 31100 |
| }, |
| { |
| "epoch": 439.8073394495413, |
| "eval_loss": 3.218825101852417, |
| "eval_runtime": 75.2435, |
| "eval_samples_per_second": 132.902, |
| "eval_steps_per_second": 8.306, |
| "step": 31100 |
| }, |
| { |
| "epoch": 441.21877205363444, |
| "grad_norm": 0.5074178576469421, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 3.2805, |
| "step": 31200 |
| }, |
| { |
| "epoch": 441.21877205363444, |
| "eval_loss": 3.218318462371826, |
| "eval_runtime": 75.2501, |
| "eval_samples_per_second": 132.89, |
| "eval_steps_per_second": 8.306, |
| "step": 31200 |
| }, |
| { |
| "epoch": 442.6302046577276, |
| "grad_norm": 0.6320539116859436, |
| "learning_rate": 4.799331103678931e-06, |
| "loss": 3.2791, |
| "step": 31300 |
| }, |
| { |
| "epoch": 442.6302046577276, |
| "eval_loss": 3.215117931365967, |
| "eval_runtime": 75.2339, |
| "eval_samples_per_second": 132.919, |
| "eval_steps_per_second": 8.307, |
| "step": 31300 |
| }, |
| { |
| "epoch": 444.04163726182077, |
| "grad_norm": 0.573082685470581, |
| "learning_rate": 4.782608695652174e-06, |
| "loss": 3.2784, |
| "step": 31400 |
| }, |
| { |
| "epoch": 444.04163726182077, |
| "eval_loss": 3.2140893936157227, |
| "eval_runtime": 75.2464, |
| "eval_samples_per_second": 132.897, |
| "eval_steps_per_second": 8.306, |
| "step": 31400 |
| }, |
| { |
| "epoch": 445.4530698659139, |
| "grad_norm": 0.4167691469192505, |
| "learning_rate": 4.765886287625419e-06, |
| "loss": 3.2764, |
| "step": 31500 |
| }, |
| { |
| "epoch": 445.4530698659139, |
| "eval_loss": 3.209411382675171, |
| "eval_runtime": 75.3085, |
| "eval_samples_per_second": 132.787, |
| "eval_steps_per_second": 8.299, |
| "step": 31500 |
| }, |
| { |
| "epoch": 446.86450247000704, |
| "grad_norm": 0.4705657958984375, |
| "learning_rate": 4.749163879598663e-06, |
| "loss": 3.2769, |
| "step": 31600 |
| }, |
| { |
| "epoch": 446.86450247000704, |
| "eval_loss": 3.212165117263794, |
| "eval_runtime": 75.4939, |
| "eval_samples_per_second": 132.461, |
| "eval_steps_per_second": 8.279, |
| "step": 31600 |
| }, |
| { |
| "epoch": 448.27593507410023, |
| "grad_norm": 0.48369839787483215, |
| "learning_rate": 4.732441471571907e-06, |
| "loss": 3.2737, |
| "step": 31700 |
| }, |
| { |
| "epoch": 448.27593507410023, |
| "eval_loss": 3.211515426635742, |
| "eval_runtime": 75.5083, |
| "eval_samples_per_second": 132.436, |
| "eval_steps_per_second": 8.277, |
| "step": 31700 |
| }, |
| { |
| "epoch": 449.68736767819337, |
| "grad_norm": 0.5090962648391724, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 3.2739, |
| "step": 31800 |
| }, |
| { |
| "epoch": 449.68736767819337, |
| "eval_loss": 3.2119221687316895, |
| "eval_runtime": 75.5485, |
| "eval_samples_per_second": 132.365, |
| "eval_steps_per_second": 8.273, |
| "step": 31800 |
| }, |
| { |
| "epoch": 451.0988002822865, |
| "grad_norm": 0.5034780502319336, |
| "learning_rate": 4.698996655518395e-06, |
| "loss": 3.2724, |
| "step": 31900 |
| }, |
| { |
| "epoch": 451.0988002822865, |
| "eval_loss": 3.2079360485076904, |
| "eval_runtime": 75.5032, |
| "eval_samples_per_second": 132.445, |
| "eval_steps_per_second": 8.278, |
| "step": 31900 |
| }, |
| { |
| "epoch": 452.5102328863797, |
| "grad_norm": 0.4804918169975281, |
| "learning_rate": 4.6822742474916394e-06, |
| "loss": 3.2711, |
| "step": 32000 |
| }, |
| { |
| "epoch": 452.5102328863797, |
| "eval_loss": 3.206244468688965, |
| "eval_runtime": 75.5098, |
| "eval_samples_per_second": 132.433, |
| "eval_steps_per_second": 8.277, |
| "step": 32000 |
| }, |
| { |
| "epoch": 453.92166549047283, |
| "grad_norm": 0.47639667987823486, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 3.2704, |
| "step": 32100 |
| }, |
| { |
| "epoch": 453.92166549047283, |
| "eval_loss": 3.2067034244537354, |
| "eval_runtime": 75.4292, |
| "eval_samples_per_second": 132.575, |
| "eval_steps_per_second": 8.286, |
| "step": 32100 |
| }, |
| { |
| "epoch": 455.33309809456597, |
| "grad_norm": 0.5194268226623535, |
| "learning_rate": 4.648829431438128e-06, |
| "loss": 3.2692, |
| "step": 32200 |
| }, |
| { |
| "epoch": 455.33309809456597, |
| "eval_loss": 3.2050275802612305, |
| "eval_runtime": 75.3819, |
| "eval_samples_per_second": 132.658, |
| "eval_steps_per_second": 8.291, |
| "step": 32200 |
| }, |
| { |
| "epoch": 456.74453069865916, |
| "grad_norm": 0.48084452748298645, |
| "learning_rate": 4.6321070234113715e-06, |
| "loss": 3.268, |
| "step": 32300 |
| }, |
| { |
| "epoch": 456.74453069865916, |
| "eval_loss": 3.205712080001831, |
| "eval_runtime": 75.4989, |
| "eval_samples_per_second": 132.452, |
| "eval_steps_per_second": 8.278, |
| "step": 32300 |
| }, |
| { |
| "epoch": 458.1559633027523, |
| "grad_norm": 0.513474702835083, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 3.2673, |
| "step": 32400 |
| }, |
| { |
| "epoch": 458.1559633027523, |
| "eval_loss": 3.2028987407684326, |
| "eval_runtime": 75.5988, |
| "eval_samples_per_second": 132.277, |
| "eval_steps_per_second": 8.267, |
| "step": 32400 |
| }, |
| { |
| "epoch": 459.56739590684543, |
| "grad_norm": 0.4173973798751831, |
| "learning_rate": 4.59866220735786e-06, |
| "loss": 3.2659, |
| "step": 32500 |
| }, |
| { |
| "epoch": 459.56739590684543, |
| "eval_loss": 3.2002434730529785, |
| "eval_runtime": 75.375, |
| "eval_samples_per_second": 132.67, |
| "eval_steps_per_second": 8.292, |
| "step": 32500 |
| }, |
| { |
| "epoch": 460.9788285109386, |
| "grad_norm": 0.44799456000328064, |
| "learning_rate": 4.581939799331104e-06, |
| "loss": 3.2648, |
| "step": 32600 |
| }, |
| { |
| "epoch": 460.9788285109386, |
| "eval_loss": 3.2009098529815674, |
| "eval_runtime": 75.4432, |
| "eval_samples_per_second": 132.55, |
| "eval_steps_per_second": 8.284, |
| "step": 32600 |
| }, |
| { |
| "epoch": 462.39026111503176, |
| "grad_norm": 0.4446271061897278, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 3.2645, |
| "step": 32700 |
| }, |
| { |
| "epoch": 462.39026111503176, |
| "eval_loss": 3.1985297203063965, |
| "eval_runtime": 75.461, |
| "eval_samples_per_second": 132.519, |
| "eval_steps_per_second": 8.282, |
| "step": 32700 |
| }, |
| { |
| "epoch": 463.8016937191249, |
| "grad_norm": 0.43961068987846375, |
| "learning_rate": 4.548494983277592e-06, |
| "loss": 3.2624, |
| "step": 32800 |
| }, |
| { |
| "epoch": 463.8016937191249, |
| "eval_loss": 3.202833890914917, |
| "eval_runtime": 75.521, |
| "eval_samples_per_second": 132.413, |
| "eval_steps_per_second": 8.276, |
| "step": 32800 |
| }, |
| { |
| "epoch": 465.2131263232181, |
| "grad_norm": 0.47034966945648193, |
| "learning_rate": 4.531772575250837e-06, |
| "loss": 3.2617, |
| "step": 32900 |
| }, |
| { |
| "epoch": 465.2131263232181, |
| "eval_loss": 3.2012619972229004, |
| "eval_runtime": 75.7188, |
| "eval_samples_per_second": 132.068, |
| "eval_steps_per_second": 8.254, |
| "step": 32900 |
| }, |
| { |
| "epoch": 466.6245589273112, |
| "grad_norm": 0.4704744517803192, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 3.2614, |
| "step": 33000 |
| }, |
| { |
| "epoch": 466.6245589273112, |
| "eval_loss": 3.1989195346832275, |
| "eval_runtime": 75.4062, |
| "eval_samples_per_second": 132.615, |
| "eval_steps_per_second": 8.288, |
| "step": 33000 |
| }, |
| { |
| "epoch": 468.03599153140436, |
| "grad_norm": 0.5247920155525208, |
| "learning_rate": 4.498327759197324e-06, |
| "loss": 3.2597, |
| "step": 33100 |
| }, |
| { |
| "epoch": 468.03599153140436, |
| "eval_loss": 3.199160099029541, |
| "eval_runtime": 75.54, |
| "eval_samples_per_second": 132.38, |
| "eval_steps_per_second": 8.274, |
| "step": 33100 |
| }, |
| { |
| "epoch": 469.44742413549756, |
| "grad_norm": 0.5080961585044861, |
| "learning_rate": 4.481605351170569e-06, |
| "loss": 3.2591, |
| "step": 33200 |
| }, |
| { |
| "epoch": 469.44742413549756, |
| "eval_loss": 3.1960511207580566, |
| "eval_runtime": 75.4862, |
| "eval_samples_per_second": 132.475, |
| "eval_steps_per_second": 8.28, |
| "step": 33200 |
| }, |
| { |
| "epoch": 470.8588567395907, |
| "grad_norm": 0.47639742493629456, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 3.2584, |
| "step": 33300 |
| }, |
| { |
| "epoch": 470.8588567395907, |
| "eval_loss": 3.1962883472442627, |
| "eval_runtime": 75.505, |
| "eval_samples_per_second": 132.442, |
| "eval_steps_per_second": 8.278, |
| "step": 33300 |
| }, |
| { |
| "epoch": 472.2702893436838, |
| "grad_norm": 0.5254648923873901, |
| "learning_rate": 4.448160535117057e-06, |
| "loss": 3.2572, |
| "step": 33400 |
| }, |
| { |
| "epoch": 472.2702893436838, |
| "eval_loss": 3.1971852779388428, |
| "eval_runtime": 73.998, |
| "eval_samples_per_second": 135.139, |
| "eval_steps_per_second": 8.446, |
| "step": 33400 |
| }, |
| { |
| "epoch": 473.681721947777, |
| "grad_norm": 0.424251526594162, |
| "learning_rate": 4.431438127090301e-06, |
| "loss": 3.2564, |
| "step": 33500 |
| }, |
| { |
| "epoch": 473.681721947777, |
| "eval_loss": 3.1964030265808105, |
| "eval_runtime": 74.3305, |
| "eval_samples_per_second": 134.534, |
| "eval_steps_per_second": 8.408, |
| "step": 33500 |
| }, |
| { |
| "epoch": 475.09315455187016, |
| "grad_norm": 0.47462198138237, |
| "learning_rate": 4.4147157190635455e-06, |
| "loss": 3.2543, |
| "step": 33600 |
| }, |
| { |
| "epoch": 475.09315455187016, |
| "eval_loss": 3.1918516159057617, |
| "eval_runtime": 75.4902, |
| "eval_samples_per_second": 132.468, |
| "eval_steps_per_second": 8.279, |
| "step": 33600 |
| }, |
| { |
| "epoch": 476.5045871559633, |
| "grad_norm": 0.4497796297073364, |
| "learning_rate": 4.397993311036789e-06, |
| "loss": 3.255, |
| "step": 33700 |
| }, |
| { |
| "epoch": 476.5045871559633, |
| "eval_loss": 3.1941463947296143, |
| "eval_runtime": 75.3615, |
| "eval_samples_per_second": 132.694, |
| "eval_steps_per_second": 8.293, |
| "step": 33700 |
| }, |
| { |
| "epoch": 477.9160197600565, |
| "grad_norm": 0.3968687355518341, |
| "learning_rate": 4.381270903010034e-06, |
| "loss": 3.2538, |
| "step": 33800 |
| }, |
| { |
| "epoch": 477.9160197600565, |
| "eval_loss": 3.188390016555786, |
| "eval_runtime": 75.5247, |
| "eval_samples_per_second": 132.407, |
| "eval_steps_per_second": 8.275, |
| "step": 33800 |
| }, |
| { |
| "epoch": 479.3274523641496, |
| "grad_norm": 0.4359143376350403, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 3.2524, |
| "step": 33900 |
| }, |
| { |
| "epoch": 479.3274523641496, |
| "eval_loss": 3.191363573074341, |
| "eval_runtime": 75.4796, |
| "eval_samples_per_second": 132.486, |
| "eval_steps_per_second": 8.28, |
| "step": 33900 |
| }, |
| { |
| "epoch": 480.73888496824276, |
| "grad_norm": 0.39059412479400635, |
| "learning_rate": 4.347826086956522e-06, |
| "loss": 3.251, |
| "step": 34000 |
| }, |
| { |
| "epoch": 480.73888496824276, |
| "eval_loss": 3.191560745239258, |
| "eval_runtime": 73.7784, |
| "eval_samples_per_second": 135.541, |
| "eval_steps_per_second": 8.471, |
| "step": 34000 |
| }, |
| { |
| "epoch": 482.1503175723359, |
| "grad_norm": 0.5065882802009583, |
| "learning_rate": 4.331103678929766e-06, |
| "loss": 3.251, |
| "step": 34100 |
| }, |
| { |
| "epoch": 482.1503175723359, |
| "eval_loss": 3.18879771232605, |
| "eval_runtime": 74.1572, |
| "eval_samples_per_second": 134.849, |
| "eval_steps_per_second": 8.428, |
| "step": 34100 |
| }, |
| { |
| "epoch": 483.5617501764291, |
| "grad_norm": 0.5141859650611877, |
| "learning_rate": 4.3143812709030106e-06, |
| "loss": 3.2489, |
| "step": 34200 |
| }, |
| { |
| "epoch": 483.5617501764291, |
| "eval_loss": 3.189378023147583, |
| "eval_runtime": 73.8069, |
| "eval_samples_per_second": 135.489, |
| "eval_steps_per_second": 8.468, |
| "step": 34200 |
| }, |
| { |
| "epoch": 484.9731827805222, |
| "grad_norm": 0.4633908271789551, |
| "learning_rate": 4.297658862876254e-06, |
| "loss": 3.2497, |
| "step": 34300 |
| }, |
| { |
| "epoch": 484.9731827805222, |
| "eval_loss": 3.1851470470428467, |
| "eval_runtime": 74.1044, |
| "eval_samples_per_second": 134.945, |
| "eval_steps_per_second": 8.434, |
| "step": 34300 |
| }, |
| { |
| "epoch": 486.38461538461536, |
| "grad_norm": 0.5483572483062744, |
| "learning_rate": 4.280936454849499e-06, |
| "loss": 3.2476, |
| "step": 34400 |
| }, |
| { |
| "epoch": 486.38461538461536, |
| "eval_loss": 3.1872897148132324, |
| "eval_runtime": 74.0461, |
| "eval_samples_per_second": 135.051, |
| "eval_steps_per_second": 8.441, |
| "step": 34400 |
| }, |
| { |
| "epoch": 487.79604798870855, |
| "grad_norm": 0.46806034445762634, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 3.2475, |
| "step": 34500 |
| }, |
| { |
| "epoch": 487.79604798870855, |
| "eval_loss": 3.1827573776245117, |
| "eval_runtime": 74.1108, |
| "eval_samples_per_second": 134.933, |
| "eval_steps_per_second": 8.433, |
| "step": 34500 |
| }, |
| { |
| "epoch": 489.2074805928017, |
| "grad_norm": 0.47688722610473633, |
| "learning_rate": 4.247491638795987e-06, |
| "loss": 3.2457, |
| "step": 34600 |
| }, |
| { |
| "epoch": 489.2074805928017, |
| "eval_loss": 3.1827759742736816, |
| "eval_runtime": 73.8156, |
| "eval_samples_per_second": 135.473, |
| "eval_steps_per_second": 8.467, |
| "step": 34600 |
| }, |
| { |
| "epoch": 490.6189131968948, |
| "grad_norm": 0.40754401683807373, |
| "learning_rate": 4.230769230769231e-06, |
| "loss": 3.2449, |
| "step": 34700 |
| }, |
| { |
| "epoch": 490.6189131968948, |
| "eval_loss": 3.1827995777130127, |
| "eval_runtime": 74.1338, |
| "eval_samples_per_second": 134.891, |
| "eval_steps_per_second": 8.431, |
| "step": 34700 |
| }, |
| { |
| "epoch": 492.030345800988, |
| "grad_norm": 0.4188888370990753, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 3.2445, |
| "step": 34800 |
| }, |
| { |
| "epoch": 492.030345800988, |
| "eval_loss": 3.181915283203125, |
| "eval_runtime": 74.1167, |
| "eval_samples_per_second": 134.922, |
| "eval_steps_per_second": 8.433, |
| "step": 34800 |
| }, |
| { |
| "epoch": 493.44177840508115, |
| "grad_norm": 0.4735229015350342, |
| "learning_rate": 4.197324414715719e-06, |
| "loss": 3.2435, |
| "step": 34900 |
| }, |
| { |
| "epoch": 493.44177840508115, |
| "eval_loss": 3.179292917251587, |
| "eval_runtime": 74.1214, |
| "eval_samples_per_second": 134.914, |
| "eval_steps_per_second": 8.432, |
| "step": 34900 |
| }, |
| { |
| "epoch": 494.8532110091743, |
| "grad_norm": 0.5335908532142639, |
| "learning_rate": 4.180602006688963e-06, |
| "loss": 3.2426, |
| "step": 35000 |
| }, |
| { |
| "epoch": 494.8532110091743, |
| "eval_loss": 3.180798053741455, |
| "eval_runtime": 74.1495, |
| "eval_samples_per_second": 134.863, |
| "eval_steps_per_second": 8.429, |
| "step": 35000 |
| }, |
| { |
| "epoch": 496.2646436132675, |
| "grad_norm": 0.48122459650039673, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 3.2424, |
| "step": 35100 |
| }, |
| { |
| "epoch": 496.2646436132675, |
| "eval_loss": 3.1801769733428955, |
| "eval_runtime": 73.8056, |
| "eval_samples_per_second": 135.491, |
| "eval_steps_per_second": 8.468, |
| "step": 35100 |
| }, |
| { |
| "epoch": 497.6760762173606, |
| "grad_norm": 0.4501022398471832, |
| "learning_rate": 4.1471571906354515e-06, |
| "loss": 3.2409, |
| "step": 35200 |
| }, |
| { |
| "epoch": 497.6760762173606, |
| "eval_loss": 3.180279016494751, |
| "eval_runtime": 75.5755, |
| "eval_samples_per_second": 132.318, |
| "eval_steps_per_second": 8.27, |
| "step": 35200 |
| }, |
| { |
| "epoch": 499.08750882145375, |
| "grad_norm": 0.45166870951652527, |
| "learning_rate": 4.130434782608696e-06, |
| "loss": 3.2396, |
| "step": 35300 |
| }, |
| { |
| "epoch": 499.08750882145375, |
| "eval_loss": 3.1797635555267334, |
| "eval_runtime": 75.5905, |
| "eval_samples_per_second": 132.292, |
| "eval_steps_per_second": 8.268, |
| "step": 35300 |
| }, |
| { |
| "epoch": 500.49894142554695, |
| "grad_norm": 0.48286956548690796, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 3.2387, |
| "step": 35400 |
| }, |
| { |
| "epoch": 500.49894142554695, |
| "eval_loss": 3.180513858795166, |
| "eval_runtime": 75.5591, |
| "eval_samples_per_second": 132.347, |
| "eval_steps_per_second": 8.272, |
| "step": 35400 |
| }, |
| { |
| "epoch": 501.9103740296401, |
| "grad_norm": 0.4946323335170746, |
| "learning_rate": 4.0969899665551845e-06, |
| "loss": 3.2381, |
| "step": 35500 |
| }, |
| { |
| "epoch": 501.9103740296401, |
| "eval_loss": 3.1741671562194824, |
| "eval_runtime": 75.5669, |
| "eval_samples_per_second": 132.333, |
| "eval_steps_per_second": 8.271, |
| "step": 35500 |
| }, |
| { |
| "epoch": 503.3218066337332, |
| "grad_norm": 0.5477709174156189, |
| "learning_rate": 4.080267558528428e-06, |
| "loss": 3.237, |
| "step": 35600 |
| }, |
| { |
| "epoch": 503.3218066337332, |
| "eval_loss": 3.1784253120422363, |
| "eval_runtime": 74.1136, |
| "eval_samples_per_second": 134.928, |
| "eval_steps_per_second": 8.433, |
| "step": 35600 |
| }, |
| { |
| "epoch": 504.7332392378264, |
| "grad_norm": 0.4314318299293518, |
| "learning_rate": 4.063545150501673e-06, |
| "loss": 3.2368, |
| "step": 35700 |
| }, |
| { |
| "epoch": 504.7332392378264, |
| "eval_loss": 3.1778831481933594, |
| "eval_runtime": 74.392, |
| "eval_samples_per_second": 134.423, |
| "eval_steps_per_second": 8.401, |
| "step": 35700 |
| }, |
| { |
| "epoch": 506.14467184191955, |
| "grad_norm": 0.5121575593948364, |
| "learning_rate": 4.046822742474917e-06, |
| "loss": 3.2348, |
| "step": 35800 |
| }, |
| { |
| "epoch": 506.14467184191955, |
| "eval_loss": 3.1743390560150146, |
| "eval_runtime": 74.2648, |
| "eval_samples_per_second": 134.653, |
| "eval_steps_per_second": 8.416, |
| "step": 35800 |
| }, |
| { |
| "epoch": 507.5561044460127, |
| "grad_norm": 0.5254048705101013, |
| "learning_rate": 4.030100334448161e-06, |
| "loss": 3.2356, |
| "step": 35900 |
| }, |
| { |
| "epoch": 507.5561044460127, |
| "eval_loss": 3.1765570640563965, |
| "eval_runtime": 74.3603, |
| "eval_samples_per_second": 134.48, |
| "eval_steps_per_second": 8.405, |
| "step": 35900 |
| }, |
| { |
| "epoch": 508.9675370501059, |
| "grad_norm": 0.5628880858421326, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 3.2345, |
| "step": 36000 |
| }, |
| { |
| "epoch": 508.9675370501059, |
| "eval_loss": 3.1755409240722656, |
| "eval_runtime": 74.3666, |
| "eval_samples_per_second": 134.469, |
| "eval_steps_per_second": 8.404, |
| "step": 36000 |
| }, |
| { |
| "epoch": 510.378969654199, |
| "grad_norm": 0.43160390853881836, |
| "learning_rate": 3.99665551839465e-06, |
| "loss": 3.2327, |
| "step": 36100 |
| }, |
| { |
| "epoch": 510.378969654199, |
| "eval_loss": 3.1721553802490234, |
| "eval_runtime": 75.0918, |
| "eval_samples_per_second": 133.17, |
| "eval_steps_per_second": 8.323, |
| "step": 36100 |
| }, |
| { |
| "epoch": 511.79040225829215, |
| "grad_norm": 0.4397335350513458, |
| "learning_rate": 3.979933110367893e-06, |
| "loss": 3.2324, |
| "step": 36200 |
| }, |
| { |
| "epoch": 511.79040225829215, |
| "eval_loss": 3.174211263656616, |
| "eval_runtime": 75.613, |
| "eval_samples_per_second": 132.252, |
| "eval_steps_per_second": 8.266, |
| "step": 36200 |
| }, |
| { |
| "epoch": 513.2018348623853, |
| "grad_norm": 0.42536553740501404, |
| "learning_rate": 3.963210702341138e-06, |
| "loss": 3.2324, |
| "step": 36300 |
| }, |
| { |
| "epoch": 513.2018348623853, |
| "eval_loss": 3.172475814819336, |
| "eval_runtime": 75.4025, |
| "eval_samples_per_second": 132.622, |
| "eval_steps_per_second": 8.289, |
| "step": 36300 |
| }, |
| { |
| "epoch": 514.6132674664784, |
| "grad_norm": 0.46854251623153687, |
| "learning_rate": 3.946488294314382e-06, |
| "loss": 3.2306, |
| "step": 36400 |
| }, |
| { |
| "epoch": 514.6132674664784, |
| "eval_loss": 3.1727116107940674, |
| "eval_runtime": 75.467, |
| "eval_samples_per_second": 132.508, |
| "eval_steps_per_second": 8.282, |
| "step": 36400 |
| }, |
| { |
| "epoch": 516.0247000705716, |
| "grad_norm": 0.40861082077026367, |
| "learning_rate": 3.929765886287626e-06, |
| "loss": 3.2304, |
| "step": 36500 |
| }, |
| { |
| "epoch": 516.0247000705716, |
| "eval_loss": 3.171363353729248, |
| "eval_runtime": 75.3463, |
| "eval_samples_per_second": 132.721, |
| "eval_steps_per_second": 8.295, |
| "step": 36500 |
| }, |
| { |
| "epoch": 517.4361326746648, |
| "grad_norm": 0.4804244339466095, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 3.2291, |
| "step": 36600 |
| }, |
| { |
| "epoch": 517.4361326746648, |
| "eval_loss": 3.168253183364868, |
| "eval_runtime": 75.5393, |
| "eval_samples_per_second": 132.381, |
| "eval_steps_per_second": 8.274, |
| "step": 36600 |
| }, |
| { |
| "epoch": 518.8475652787579, |
| "grad_norm": 0.4989255964756012, |
| "learning_rate": 3.896321070234114e-06, |
| "loss": 3.2283, |
| "step": 36700 |
| }, |
| { |
| "epoch": 518.8475652787579, |
| "eval_loss": 3.1694819927215576, |
| "eval_runtime": 75.6196, |
| "eval_samples_per_second": 132.241, |
| "eval_steps_per_second": 8.265, |
| "step": 36700 |
| }, |
| { |
| "epoch": 520.2589978828511, |
| "grad_norm": 0.4205889403820038, |
| "learning_rate": 3.8795986622073584e-06, |
| "loss": 3.2272, |
| "step": 36800 |
| }, |
| { |
| "epoch": 520.2589978828511, |
| "eval_loss": 3.167910575866699, |
| "eval_runtime": 75.533, |
| "eval_samples_per_second": 132.392, |
| "eval_steps_per_second": 8.275, |
| "step": 36800 |
| }, |
| { |
| "epoch": 521.6704304869443, |
| "grad_norm": 0.49398526549339294, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 3.227, |
| "step": 36900 |
| }, |
| { |
| "epoch": 521.6704304869443, |
| "eval_loss": 3.166125774383545, |
| "eval_runtime": 75.4176, |
| "eval_samples_per_second": 132.595, |
| "eval_steps_per_second": 8.287, |
| "step": 36900 |
| }, |
| { |
| "epoch": 523.0818630910373, |
| "grad_norm": 0.42277684807777405, |
| "learning_rate": 3.846153846153847e-06, |
| "loss": 3.2252, |
| "step": 37000 |
| }, |
| { |
| "epoch": 523.0818630910373, |
| "eval_loss": 3.1657633781433105, |
| "eval_runtime": 75.5313, |
| "eval_samples_per_second": 132.395, |
| "eval_steps_per_second": 8.275, |
| "step": 37000 |
| }, |
| { |
| "epoch": 524.4932956951305, |
| "grad_norm": 0.4734378159046173, |
| "learning_rate": 3.8294314381270906e-06, |
| "loss": 3.225, |
| "step": 37100 |
| }, |
| { |
| "epoch": 524.4932956951305, |
| "eval_loss": 3.167834997177124, |
| "eval_runtime": 75.5739, |
| "eval_samples_per_second": 132.321, |
| "eval_steps_per_second": 8.27, |
| "step": 37100 |
| }, |
| { |
| "epoch": 525.9047282992237, |
| "grad_norm": 0.466743528842926, |
| "learning_rate": 3.812709030100335e-06, |
| "loss": 3.225, |
| "step": 37200 |
| }, |
| { |
| "epoch": 525.9047282992237, |
| "eval_loss": 3.165196180343628, |
| "eval_runtime": 75.556, |
| "eval_samples_per_second": 132.352, |
| "eval_steps_per_second": 8.272, |
| "step": 37200 |
| }, |
| { |
| "epoch": 527.3161609033168, |
| "grad_norm": 0.4317799210548401, |
| "learning_rate": 3.7959866220735793e-06, |
| "loss": 3.224, |
| "step": 37300 |
| }, |
| { |
| "epoch": 527.3161609033168, |
| "eval_loss": 3.1631827354431152, |
| "eval_runtime": 75.6043, |
| "eval_samples_per_second": 132.268, |
| "eval_steps_per_second": 8.267, |
| "step": 37300 |
| }, |
| { |
| "epoch": 528.72759350741, |
| "grad_norm": 0.39390048384666443, |
| "learning_rate": 3.7792642140468235e-06, |
| "loss": 3.2237, |
| "step": 37400 |
| }, |
| { |
| "epoch": 528.72759350741, |
| "eval_loss": 3.165860414505005, |
| "eval_runtime": 75.4156, |
| "eval_samples_per_second": 132.599, |
| "eval_steps_per_second": 8.287, |
| "step": 37400 |
| }, |
| { |
| "epoch": 530.1390261115032, |
| "grad_norm": 0.4415905177593231, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 3.2233, |
| "step": 37500 |
| }, |
| { |
| "epoch": 530.1390261115032, |
| "eval_loss": 3.163349151611328, |
| "eval_runtime": 75.5719, |
| "eval_samples_per_second": 132.324, |
| "eval_steps_per_second": 8.27, |
| "step": 37500 |
| }, |
| { |
| "epoch": 531.5504587155963, |
| "grad_norm": 0.5063501596450806, |
| "learning_rate": 3.745819397993311e-06, |
| "loss": 3.2206, |
| "step": 37600 |
| }, |
| { |
| "epoch": 531.5504587155963, |
| "eval_loss": 3.163081407546997, |
| "eval_runtime": 75.5766, |
| "eval_samples_per_second": 132.316, |
| "eval_steps_per_second": 8.27, |
| "step": 37600 |
| }, |
| { |
| "epoch": 532.9618913196895, |
| "grad_norm": 0.4227210283279419, |
| "learning_rate": 3.7290969899665552e-06, |
| "loss": 3.2212, |
| "step": 37700 |
| }, |
| { |
| "epoch": 532.9618913196895, |
| "eval_loss": 3.1642813682556152, |
| "eval_runtime": 75.6173, |
| "eval_samples_per_second": 132.245, |
| "eval_steps_per_second": 8.265, |
| "step": 37700 |
| }, |
| { |
| "epoch": 534.3733239237827, |
| "grad_norm": 0.5058636665344238, |
| "learning_rate": 3.7123745819397994e-06, |
| "loss": 3.2198, |
| "step": 37800 |
| }, |
| { |
| "epoch": 534.3733239237827, |
| "eval_loss": 3.1637208461761475, |
| "eval_runtime": 75.5813, |
| "eval_samples_per_second": 132.308, |
| "eval_steps_per_second": 8.269, |
| "step": 37800 |
| }, |
| { |
| "epoch": 535.7847565278757, |
| "grad_norm": 0.37743571400642395, |
| "learning_rate": 3.6956521739130436e-06, |
| "loss": 3.2193, |
| "step": 37900 |
| }, |
| { |
| "epoch": 535.7847565278757, |
| "eval_loss": 3.161426544189453, |
| "eval_runtime": 75.4254, |
| "eval_samples_per_second": 132.581, |
| "eval_steps_per_second": 8.286, |
| "step": 37900 |
| }, |
| { |
| "epoch": 537.1961891319689, |
| "grad_norm": 0.4442863464355469, |
| "learning_rate": 3.6789297658862878e-06, |
| "loss": 3.2186, |
| "step": 38000 |
| }, |
| { |
| "epoch": 537.1961891319689, |
| "eval_loss": 3.1596264839172363, |
| "eval_runtime": 75.6205, |
| "eval_samples_per_second": 132.239, |
| "eval_steps_per_second": 8.265, |
| "step": 38000 |
| }, |
| { |
| "epoch": 538.6076217360621, |
| "grad_norm": 0.46085959672927856, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 3.2176, |
| "step": 38100 |
| }, |
| { |
| "epoch": 538.6076217360621, |
| "eval_loss": 3.161233901977539, |
| "eval_runtime": 75.6478, |
| "eval_samples_per_second": 132.191, |
| "eval_steps_per_second": 8.262, |
| "step": 38100 |
| }, |
| { |
| "epoch": 540.0190543401552, |
| "grad_norm": 0.45532315969467163, |
| "learning_rate": 3.645484949832776e-06, |
| "loss": 3.217, |
| "step": 38200 |
| }, |
| { |
| "epoch": 540.0190543401552, |
| "eval_loss": 3.159839391708374, |
| "eval_runtime": 75.6969, |
| "eval_samples_per_second": 132.106, |
| "eval_steps_per_second": 8.257, |
| "step": 38200 |
| }, |
| { |
| "epoch": 541.4304869442484, |
| "grad_norm": 0.4042709469795227, |
| "learning_rate": 3.6287625418060203e-06, |
| "loss": 3.2167, |
| "step": 38300 |
| }, |
| { |
| "epoch": 541.4304869442484, |
| "eval_loss": 3.1588003635406494, |
| "eval_runtime": 75.6208, |
| "eval_samples_per_second": 132.239, |
| "eval_steps_per_second": 8.265, |
| "step": 38300 |
| }, |
| { |
| "epoch": 542.8419195483416, |
| "grad_norm": 0.4348280727863312, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 3.2158, |
| "step": 38400 |
| }, |
| { |
| "epoch": 542.8419195483416, |
| "eval_loss": 3.156273365020752, |
| "eval_runtime": 75.5256, |
| "eval_samples_per_second": 132.405, |
| "eval_steps_per_second": 8.275, |
| "step": 38400 |
| }, |
| { |
| "epoch": 544.2533521524347, |
| "grad_norm": 0.49892503023147583, |
| "learning_rate": 3.5953177257525087e-06, |
| "loss": 3.2145, |
| "step": 38500 |
| }, |
| { |
| "epoch": 544.2533521524347, |
| "eval_loss": 3.1565325260162354, |
| "eval_runtime": 75.5712, |
| "eval_samples_per_second": 132.325, |
| "eval_steps_per_second": 8.27, |
| "step": 38500 |
| }, |
| { |
| "epoch": 545.6647847565279, |
| "grad_norm": 0.522286057472229, |
| "learning_rate": 3.578595317725753e-06, |
| "loss": 3.214, |
| "step": 38600 |
| }, |
| { |
| "epoch": 545.6647847565279, |
| "eval_loss": 3.156520128250122, |
| "eval_runtime": 75.3314, |
| "eval_samples_per_second": 132.747, |
| "eval_steps_per_second": 8.297, |
| "step": 38600 |
| }, |
| { |
| "epoch": 547.0762173606211, |
| "grad_norm": 0.4889813959598541, |
| "learning_rate": 3.5618729096989966e-06, |
| "loss": 3.2125, |
| "step": 38700 |
| }, |
| { |
| "epoch": 547.0762173606211, |
| "eval_loss": 3.156526565551758, |
| "eval_runtime": 75.5675, |
| "eval_samples_per_second": 132.332, |
| "eval_steps_per_second": 8.271, |
| "step": 38700 |
| }, |
| { |
| "epoch": 548.4876499647141, |
| "grad_norm": 0.43349429965019226, |
| "learning_rate": 3.5451505016722408e-06, |
| "loss": 3.2127, |
| "step": 38800 |
| }, |
| { |
| "epoch": 548.4876499647141, |
| "eval_loss": 3.155153274536133, |
| "eval_runtime": 75.5428, |
| "eval_samples_per_second": 132.375, |
| "eval_steps_per_second": 8.273, |
| "step": 38800 |
| }, |
| { |
| "epoch": 549.8990825688073, |
| "grad_norm": 0.4011054039001465, |
| "learning_rate": 3.528428093645485e-06, |
| "loss": 3.2116, |
| "step": 38900 |
| }, |
| { |
| "epoch": 549.8990825688073, |
| "eval_loss": 3.152825355529785, |
| "eval_runtime": 75.5836, |
| "eval_samples_per_second": 132.304, |
| "eval_steps_per_second": 8.269, |
| "step": 38900 |
| }, |
| { |
| "epoch": 551.3105151729005, |
| "grad_norm": 0.4457249939441681, |
| "learning_rate": 3.511705685618729e-06, |
| "loss": 3.2117, |
| "step": 39000 |
| }, |
| { |
| "epoch": 551.3105151729005, |
| "eval_loss": 3.151512384414673, |
| "eval_runtime": 75.5827, |
| "eval_samples_per_second": 132.305, |
| "eval_steps_per_second": 8.269, |
| "step": 39000 |
| }, |
| { |
| "epoch": 552.7219477769936, |
| "grad_norm": 0.42113420367240906, |
| "learning_rate": 3.4949832775919733e-06, |
| "loss": 3.2107, |
| "step": 39100 |
| }, |
| { |
| "epoch": 552.7219477769936, |
| "eval_loss": 3.1546247005462646, |
| "eval_runtime": 75.4062, |
| "eval_samples_per_second": 132.615, |
| "eval_steps_per_second": 8.288, |
| "step": 39100 |
| }, |
| { |
| "epoch": 554.1333803810868, |
| "grad_norm": 0.3948840796947479, |
| "learning_rate": 3.4782608695652175e-06, |
| "loss": 3.2097, |
| "step": 39200 |
| }, |
| { |
| "epoch": 554.1333803810868, |
| "eval_loss": 3.1522581577301025, |
| "eval_runtime": 75.6127, |
| "eval_samples_per_second": 132.253, |
| "eval_steps_per_second": 8.266, |
| "step": 39200 |
| }, |
| { |
| "epoch": 555.54481298518, |
| "grad_norm": 0.4366632103919983, |
| "learning_rate": 3.4615384615384617e-06, |
| "loss": 3.2087, |
| "step": 39300 |
| }, |
| { |
| "epoch": 555.54481298518, |
| "eval_loss": 3.1504454612731934, |
| "eval_runtime": 75.6076, |
| "eval_samples_per_second": 132.262, |
| "eval_steps_per_second": 8.266, |
| "step": 39300 |
| }, |
| { |
| "epoch": 556.9562455892731, |
| "grad_norm": 0.48205867409706116, |
| "learning_rate": 3.444816053511706e-06, |
| "loss": 3.2093, |
| "step": 39400 |
| }, |
| { |
| "epoch": 556.9562455892731, |
| "eval_loss": 3.1492788791656494, |
| "eval_runtime": 75.6481, |
| "eval_samples_per_second": 132.191, |
| "eval_steps_per_second": 8.262, |
| "step": 39400 |
| }, |
| { |
| "epoch": 558.3676781933663, |
| "grad_norm": 0.37896397709846497, |
| "learning_rate": 3.42809364548495e-06, |
| "loss": 3.2076, |
| "step": 39500 |
| }, |
| { |
| "epoch": 558.3676781933663, |
| "eval_loss": 3.1518449783325195, |
| "eval_runtime": 75.7, |
| "eval_samples_per_second": 132.1, |
| "eval_steps_per_second": 8.256, |
| "step": 39500 |
| }, |
| { |
| "epoch": 559.7791107974595, |
| "grad_norm": 0.5973118543624878, |
| "learning_rate": 3.4113712374581942e-06, |
| "loss": 3.2063, |
| "step": 39600 |
| }, |
| { |
| "epoch": 559.7791107974595, |
| "eval_loss": 3.149134397506714, |
| "eval_runtime": 75.4698, |
| "eval_samples_per_second": 132.503, |
| "eval_steps_per_second": 8.281, |
| "step": 39600 |
| }, |
| { |
| "epoch": 561.1905434015525, |
| "grad_norm": 0.45621445775032043, |
| "learning_rate": 3.3946488294314384e-06, |
| "loss": 3.2068, |
| "step": 39700 |
| }, |
| { |
| "epoch": 561.1905434015525, |
| "eval_loss": 3.1491434574127197, |
| "eval_runtime": 75.6152, |
| "eval_samples_per_second": 132.248, |
| "eval_steps_per_second": 8.266, |
| "step": 39700 |
| }, |
| { |
| "epoch": 562.6019760056457, |
| "grad_norm": 0.43519681692123413, |
| "learning_rate": 3.3779264214046826e-06, |
| "loss": 3.2064, |
| "step": 39800 |
| }, |
| { |
| "epoch": 562.6019760056457, |
| "eval_loss": 3.150059938430786, |
| "eval_runtime": 75.5035, |
| "eval_samples_per_second": 132.444, |
| "eval_steps_per_second": 8.278, |
| "step": 39800 |
| }, |
| { |
| "epoch": 564.0134086097389, |
| "grad_norm": 0.4100271165370941, |
| "learning_rate": 3.3612040133779268e-06, |
| "loss": 3.2052, |
| "step": 39900 |
| }, |
| { |
| "epoch": 564.0134086097389, |
| "eval_loss": 3.146653175354004, |
| "eval_runtime": 75.6217, |
| "eval_samples_per_second": 132.237, |
| "eval_steps_per_second": 8.265, |
| "step": 39900 |
| }, |
| { |
| "epoch": 565.424841213832, |
| "grad_norm": 0.5362276434898376, |
| "learning_rate": 3.344481605351171e-06, |
| "loss": 3.2053, |
| "step": 40000 |
| }, |
| { |
| "epoch": 565.424841213832, |
| "eval_loss": 3.1463942527770996, |
| "eval_runtime": 75.2649, |
| "eval_samples_per_second": 132.864, |
| "eval_steps_per_second": 8.304, |
| "step": 40000 |
| }, |
| { |
| "epoch": 566.8362738179252, |
| "grad_norm": 0.49909672141075134, |
| "learning_rate": 3.327759197324415e-06, |
| "loss": 3.2037, |
| "step": 40100 |
| }, |
| { |
| "epoch": 566.8362738179252, |
| "eval_loss": 3.146095037460327, |
| "eval_runtime": 76.2008, |
| "eval_samples_per_second": 131.232, |
| "eval_steps_per_second": 8.202, |
| "step": 40100 |
| }, |
| { |
| "epoch": 568.2477064220184, |
| "grad_norm": 0.4283011257648468, |
| "learning_rate": 3.3110367892976593e-06, |
| "loss": 3.203, |
| "step": 40200 |
| }, |
| { |
| "epoch": 568.2477064220184, |
| "eval_loss": 3.1441144943237305, |
| "eval_runtime": 75.2084, |
| "eval_samples_per_second": 132.964, |
| "eval_steps_per_second": 8.31, |
| "step": 40200 |
| }, |
| { |
| "epoch": 569.6591390261115, |
| "grad_norm": 0.48537662625312805, |
| "learning_rate": 3.2943143812709035e-06, |
| "loss": 3.202, |
| "step": 40300 |
| }, |
| { |
| "epoch": 569.6591390261115, |
| "eval_loss": 3.146061658859253, |
| "eval_runtime": 75.1567, |
| "eval_samples_per_second": 133.055, |
| "eval_steps_per_second": 8.316, |
| "step": 40300 |
| }, |
| { |
| "epoch": 571.0705716302047, |
| "grad_norm": 0.5248004794120789, |
| "learning_rate": 3.2775919732441473e-06, |
| "loss": 3.2024, |
| "step": 40400 |
| }, |
| { |
| "epoch": 571.0705716302047, |
| "eval_loss": 3.1482038497924805, |
| "eval_runtime": 75.2627, |
| "eval_samples_per_second": 132.868, |
| "eval_steps_per_second": 8.304, |
| "step": 40400 |
| }, |
| { |
| "epoch": 572.4820042342978, |
| "grad_norm": 0.406740665435791, |
| "learning_rate": 3.2608695652173914e-06, |
| "loss": 3.2005, |
| "step": 40500 |
| }, |
| { |
| "epoch": 572.4820042342978, |
| "eval_loss": 3.1435883045196533, |
| "eval_runtime": 75.3704, |
| "eval_samples_per_second": 132.678, |
| "eval_steps_per_second": 8.292, |
| "step": 40500 |
| }, |
| { |
| "epoch": 573.8934368383909, |
| "grad_norm": 0.4268272817134857, |
| "learning_rate": 3.2441471571906356e-06, |
| "loss": 3.2016, |
| "step": 40600 |
| }, |
| { |
| "epoch": 573.8934368383909, |
| "eval_loss": 3.1432077884674072, |
| "eval_runtime": 75.5314, |
| "eval_samples_per_second": 132.395, |
| "eval_steps_per_second": 8.275, |
| "step": 40600 |
| }, |
| { |
| "epoch": 575.3048694424841, |
| "grad_norm": 0.3838156461715698, |
| "learning_rate": 3.22742474916388e-06, |
| "loss": 3.1999, |
| "step": 40700 |
| }, |
| { |
| "epoch": 575.3048694424841, |
| "eval_loss": 3.1417696475982666, |
| "eval_runtime": 75.5553, |
| "eval_samples_per_second": 132.353, |
| "eval_steps_per_second": 8.272, |
| "step": 40700 |
| }, |
| { |
| "epoch": 576.7163020465773, |
| "grad_norm": 0.3539990484714508, |
| "learning_rate": 3.210702341137124e-06, |
| "loss": 3.1997, |
| "step": 40800 |
| }, |
| { |
| "epoch": 576.7163020465773, |
| "eval_loss": 3.141932964324951, |
| "eval_runtime": 75.6878, |
| "eval_samples_per_second": 132.122, |
| "eval_steps_per_second": 8.258, |
| "step": 40800 |
| }, |
| { |
| "epoch": 578.1277346506704, |
| "grad_norm": 0.45554637908935547, |
| "learning_rate": 3.193979933110368e-06, |
| "loss": 3.1994, |
| "step": 40900 |
| }, |
| { |
| "epoch": 578.1277346506704, |
| "eval_loss": 3.142242670059204, |
| "eval_runtime": 75.5684, |
| "eval_samples_per_second": 132.33, |
| "eval_steps_per_second": 8.271, |
| "step": 40900 |
| }, |
| { |
| "epoch": 579.5391672547636, |
| "grad_norm": 0.44678226113319397, |
| "learning_rate": 3.1772575250836123e-06, |
| "loss": 3.1994, |
| "step": 41000 |
| }, |
| { |
| "epoch": 579.5391672547636, |
| "eval_loss": 3.141388177871704, |
| "eval_runtime": 75.4266, |
| "eval_samples_per_second": 132.579, |
| "eval_steps_per_second": 8.286, |
| "step": 41000 |
| }, |
| { |
| "epoch": 580.9505998588568, |
| "grad_norm": 0.4156506061553955, |
| "learning_rate": 3.1605351170568565e-06, |
| "loss": 3.1978, |
| "step": 41100 |
| }, |
| { |
| "epoch": 580.9505998588568, |
| "eval_loss": 3.138974905014038, |
| "eval_runtime": 75.6365, |
| "eval_samples_per_second": 132.211, |
| "eval_steps_per_second": 8.263, |
| "step": 41100 |
| }, |
| { |
| "epoch": 582.3620324629499, |
| "grad_norm": 0.42633840441703796, |
| "learning_rate": 3.1438127090301007e-06, |
| "loss": 3.1976, |
| "step": 41200 |
| }, |
| { |
| "epoch": 582.3620324629499, |
| "eval_loss": 3.1378233432769775, |
| "eval_runtime": 75.5912, |
| "eval_samples_per_second": 132.291, |
| "eval_steps_per_second": 8.268, |
| "step": 41200 |
| }, |
| { |
| "epoch": 583.773465067043, |
| "grad_norm": 0.3734727203845978, |
| "learning_rate": 3.127090301003345e-06, |
| "loss": 3.1982, |
| "step": 41300 |
| }, |
| { |
| "epoch": 583.773465067043, |
| "eval_loss": 3.1387135982513428, |
| "eval_runtime": 75.5016, |
| "eval_samples_per_second": 132.448, |
| "eval_steps_per_second": 8.278, |
| "step": 41300 |
| }, |
| { |
| "epoch": 585.1848976711362, |
| "grad_norm": 0.46358761191368103, |
| "learning_rate": 3.110367892976589e-06, |
| "loss": 3.1963, |
| "step": 41400 |
| }, |
| { |
| "epoch": 585.1848976711362, |
| "eval_loss": 3.138916492462158, |
| "eval_runtime": 75.6514, |
| "eval_samples_per_second": 132.185, |
| "eval_steps_per_second": 8.262, |
| "step": 41400 |
| }, |
| { |
| "epoch": 586.5963302752293, |
| "grad_norm": 0.48239365220069885, |
| "learning_rate": 3.0936454849498333e-06, |
| "loss": 3.1956, |
| "step": 41500 |
| }, |
| { |
| "epoch": 586.5963302752293, |
| "eval_loss": 3.1406919956207275, |
| "eval_runtime": 75.3804, |
| "eval_samples_per_second": 132.66, |
| "eval_steps_per_second": 8.291, |
| "step": 41500 |
| }, |
| { |
| "epoch": 588.0077628793225, |
| "grad_norm": 0.3974038362503052, |
| "learning_rate": 3.0769230769230774e-06, |
| "loss": 3.1953, |
| "step": 41600 |
| }, |
| { |
| "epoch": 588.0077628793225, |
| "eval_loss": 3.138157844543457, |
| "eval_runtime": 75.5546, |
| "eval_samples_per_second": 132.355, |
| "eval_steps_per_second": 8.272, |
| "step": 41600 |
| }, |
| { |
| "epoch": 589.4191954834157, |
| "grad_norm": 0.379353404045105, |
| "learning_rate": 3.0602006688963216e-06, |
| "loss": 3.195, |
| "step": 41700 |
| }, |
| { |
| "epoch": 589.4191954834157, |
| "eval_loss": 3.137274980545044, |
| "eval_runtime": 75.5651, |
| "eval_samples_per_second": 132.336, |
| "eval_steps_per_second": 8.271, |
| "step": 41700 |
| }, |
| { |
| "epoch": 590.8306280875088, |
| "grad_norm": 0.40416419506073, |
| "learning_rate": 3.043478260869566e-06, |
| "loss": 3.1938, |
| "step": 41800 |
| }, |
| { |
| "epoch": 590.8306280875088, |
| "eval_loss": 3.134814739227295, |
| "eval_runtime": 75.6798, |
| "eval_samples_per_second": 132.136, |
| "eval_steps_per_second": 8.258, |
| "step": 41800 |
| }, |
| { |
| "epoch": 592.242060691602, |
| "grad_norm": 0.49428197741508484, |
| "learning_rate": 3.02675585284281e-06, |
| "loss": 3.1933, |
| "step": 41900 |
| }, |
| { |
| "epoch": 592.242060691602, |
| "eval_loss": 3.13729190826416, |
| "eval_runtime": 75.4545, |
| "eval_samples_per_second": 132.53, |
| "eval_steps_per_second": 8.283, |
| "step": 41900 |
| }, |
| { |
| "epoch": 593.6534932956952, |
| "grad_norm": 0.39984115958213806, |
| "learning_rate": 3.010033444816054e-06, |
| "loss": 3.1939, |
| "step": 42000 |
| }, |
| { |
| "epoch": 593.6534932956952, |
| "eval_loss": 3.1380887031555176, |
| "eval_runtime": 74.1402, |
| "eval_samples_per_second": 134.88, |
| "eval_steps_per_second": 8.43, |
| "step": 42000 |
| }, |
| { |
| "epoch": 595.0649258997882, |
| "grad_norm": 0.42510271072387695, |
| "learning_rate": 2.9933110367892983e-06, |
| "loss": 3.1925, |
| "step": 42100 |
| }, |
| { |
| "epoch": 595.0649258997882, |
| "eval_loss": 3.1366851329803467, |
| "eval_runtime": 74.1063, |
| "eval_samples_per_second": 134.941, |
| "eval_steps_per_second": 8.434, |
| "step": 42100 |
| }, |
| { |
| "epoch": 596.4763585038814, |
| "grad_norm": 0.39396584033966064, |
| "learning_rate": 2.976588628762542e-06, |
| "loss": 3.192, |
| "step": 42200 |
| }, |
| { |
| "epoch": 596.4763585038814, |
| "eval_loss": 3.1356325149536133, |
| "eval_runtime": 74.0085, |
| "eval_samples_per_second": 135.12, |
| "eval_steps_per_second": 8.445, |
| "step": 42200 |
| }, |
| { |
| "epoch": 597.8877911079746, |
| "grad_norm": 0.38813215494155884, |
| "learning_rate": 2.9598662207357863e-06, |
| "loss": 3.1908, |
| "step": 42300 |
| }, |
| { |
| "epoch": 597.8877911079746, |
| "eval_loss": 3.1375906467437744, |
| "eval_runtime": 75.3924, |
| "eval_samples_per_second": 132.639, |
| "eval_steps_per_second": 8.29, |
| "step": 42300 |
| }, |
| { |
| "epoch": 599.2992237120677, |
| "grad_norm": 0.39870092272758484, |
| "learning_rate": 2.9431438127090305e-06, |
| "loss": 3.1907, |
| "step": 42400 |
| }, |
| { |
| "epoch": 599.2992237120677, |
| "eval_loss": 3.1329517364501953, |
| "eval_runtime": 75.5449, |
| "eval_samples_per_second": 132.372, |
| "eval_steps_per_second": 8.273, |
| "step": 42400 |
| }, |
| { |
| "epoch": 600.7106563161609, |
| "grad_norm": 0.40714314579963684, |
| "learning_rate": 2.9264214046822746e-06, |
| "loss": 3.1908, |
| "step": 42500 |
| }, |
| { |
| "epoch": 600.7106563161609, |
| "eval_loss": 3.130784034729004, |
| "eval_runtime": 75.095, |
| "eval_samples_per_second": 133.165, |
| "eval_steps_per_second": 8.323, |
| "step": 42500 |
| }, |
| { |
| "epoch": 602.1220889202541, |
| "grad_norm": 0.4590309262275696, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 3.1906, |
| "step": 42600 |
| }, |
| { |
| "epoch": 602.1220889202541, |
| "eval_loss": 3.130798816680908, |
| "eval_runtime": 75.2081, |
| "eval_samples_per_second": 132.964, |
| "eval_steps_per_second": 8.31, |
| "step": 42600 |
| }, |
| { |
| "epoch": 603.5335215243472, |
| "grad_norm": 0.4752305746078491, |
| "learning_rate": 2.8929765886287626e-06, |
| "loss": 3.1885, |
| "step": 42700 |
| }, |
| { |
| "epoch": 603.5335215243472, |
| "eval_loss": 3.1303343772888184, |
| "eval_runtime": 75.2947, |
| "eval_samples_per_second": 132.812, |
| "eval_steps_per_second": 8.301, |
| "step": 42700 |
| }, |
| { |
| "epoch": 604.9449541284404, |
| "grad_norm": 0.5041990876197815, |
| "learning_rate": 2.8762541806020068e-06, |
| "loss": 3.1889, |
| "step": 42800 |
| }, |
| { |
| "epoch": 604.9449541284404, |
| "eval_loss": 3.133119821548462, |
| "eval_runtime": 75.4469, |
| "eval_samples_per_second": 132.544, |
| "eval_steps_per_second": 8.284, |
| "step": 42800 |
| }, |
| { |
| "epoch": 606.3563867325336, |
| "grad_norm": 0.4223128855228424, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 3.1883, |
| "step": 42900 |
| }, |
| { |
| "epoch": 606.3563867325336, |
| "eval_loss": 3.13100266456604, |
| "eval_runtime": 75.1208, |
| "eval_samples_per_second": 133.119, |
| "eval_steps_per_second": 8.32, |
| "step": 42900 |
| }, |
| { |
| "epoch": 607.7678193366266, |
| "grad_norm": 0.4178304672241211, |
| "learning_rate": 2.842809364548495e-06, |
| "loss": 3.1884, |
| "step": 43000 |
| }, |
| { |
| "epoch": 607.7678193366266, |
| "eval_loss": 3.1309714317321777, |
| "eval_runtime": 75.1871, |
| "eval_samples_per_second": 133.001, |
| "eval_steps_per_second": 8.313, |
| "step": 43000 |
| }, |
| { |
| "epoch": 609.1792519407198, |
| "grad_norm": 0.38649800419807434, |
| "learning_rate": 2.8260869565217393e-06, |
| "loss": 3.187, |
| "step": 43100 |
| }, |
| { |
| "epoch": 609.1792519407198, |
| "eval_loss": 3.1306862831115723, |
| "eval_runtime": 75.2092, |
| "eval_samples_per_second": 132.962, |
| "eval_steps_per_second": 8.31, |
| "step": 43100 |
| }, |
| { |
| "epoch": 610.590684544813, |
| "grad_norm": 0.4299427270889282, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 3.1873, |
| "step": 43200 |
| }, |
| { |
| "epoch": 610.590684544813, |
| "eval_loss": 3.1312477588653564, |
| "eval_runtime": 75.2144, |
| "eval_samples_per_second": 132.953, |
| "eval_steps_per_second": 8.31, |
| "step": 43200 |
| }, |
| { |
| "epoch": 612.0021171489061, |
| "grad_norm": 0.46031683683395386, |
| "learning_rate": 2.7926421404682277e-06, |
| "loss": 3.1868, |
| "step": 43300 |
| }, |
| { |
| "epoch": 612.0021171489061, |
| "eval_loss": 3.1275081634521484, |
| "eval_runtime": 75.6145, |
| "eval_samples_per_second": 132.25, |
| "eval_steps_per_second": 8.266, |
| "step": 43300 |
| }, |
| { |
| "epoch": 613.4135497529993, |
| "grad_norm": 0.4347423017024994, |
| "learning_rate": 2.7759197324414714e-06, |
| "loss": 3.1856, |
| "step": 43400 |
| }, |
| { |
| "epoch": 613.4135497529993, |
| "eval_loss": 3.1303658485412598, |
| "eval_runtime": 75.6408, |
| "eval_samples_per_second": 132.204, |
| "eval_steps_per_second": 8.263, |
| "step": 43400 |
| }, |
| { |
| "epoch": 614.8249823570925, |
| "grad_norm": 0.38185596466064453, |
| "learning_rate": 2.7591973244147156e-06, |
| "loss": 3.1844, |
| "step": 43500 |
| }, |
| { |
| "epoch": 614.8249823570925, |
| "eval_loss": 3.1326870918273926, |
| "eval_runtime": 75.6056, |
| "eval_samples_per_second": 132.265, |
| "eval_steps_per_second": 8.267, |
| "step": 43500 |
| }, |
| { |
| "epoch": 616.2364149611856, |
| "grad_norm": 0.40985825657844543, |
| "learning_rate": 2.74247491638796e-06, |
| "loss": 3.1847, |
| "step": 43600 |
| }, |
| { |
| "epoch": 616.2364149611856, |
| "eval_loss": 3.128065347671509, |
| "eval_runtime": 75.6157, |
| "eval_samples_per_second": 132.248, |
| "eval_steps_per_second": 8.265, |
| "step": 43600 |
| }, |
| { |
| "epoch": 617.6478475652788, |
| "grad_norm": 0.43245822191238403, |
| "learning_rate": 2.725752508361204e-06, |
| "loss": 3.1839, |
| "step": 43700 |
| }, |
| { |
| "epoch": 617.6478475652788, |
| "eval_loss": 3.128894805908203, |
| "eval_runtime": 75.5829, |
| "eval_samples_per_second": 132.305, |
| "eval_steps_per_second": 8.269, |
| "step": 43700 |
| }, |
| { |
| "epoch": 619.059280169372, |
| "grad_norm": 0.3881818950176239, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 3.1827, |
| "step": 43800 |
| }, |
| { |
| "epoch": 619.059280169372, |
| "eval_loss": 3.1263980865478516, |
| "eval_runtime": 75.3715, |
| "eval_samples_per_second": 132.676, |
| "eval_steps_per_second": 8.292, |
| "step": 43800 |
| }, |
| { |
| "epoch": 620.470712773465, |
| "grad_norm": 0.36317145824432373, |
| "learning_rate": 2.6923076923076923e-06, |
| "loss": 3.1842, |
| "step": 43900 |
| }, |
| { |
| "epoch": 620.470712773465, |
| "eval_loss": 3.1291022300720215, |
| "eval_runtime": 75.6179, |
| "eval_samples_per_second": 132.244, |
| "eval_steps_per_second": 8.265, |
| "step": 43900 |
| }, |
| { |
| "epoch": 621.8821453775582, |
| "grad_norm": 0.3756316602230072, |
| "learning_rate": 2.6755852842809365e-06, |
| "loss": 3.1834, |
| "step": 44000 |
| }, |
| { |
| "epoch": 621.8821453775582, |
| "eval_loss": 3.1239471435546875, |
| "eval_runtime": 75.2359, |
| "eval_samples_per_second": 132.915, |
| "eval_steps_per_second": 8.307, |
| "step": 44000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 858, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.901642959904604e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|