{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8208820882088209, "eval_steps": 23, "global_step": 76, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010801080108010801, "grad_norm": 63.828086432644454, "learning_rate": 5.333333333333333e-07, "loss": 6.498, "step": 1 }, { "epoch": 0.010801080108010801, "eval_loss": NaN, "eval_runtime": 122.6901, "eval_samples_per_second": 8.77, "eval_steps_per_second": 2.193, "step": 1 }, { "epoch": 0.021602160216021602, "grad_norm": 66.4128492022695, "learning_rate": 1.0666666666666667e-06, "loss": 6.5865, "step": 2 }, { "epoch": 0.032403240324032405, "grad_norm": 71.37254447564402, "learning_rate": 1.6e-06, "loss": 6.789, "step": 3 }, { "epoch": 0.043204320432043204, "grad_norm": 60.356993414372305, "learning_rate": 2.1333333333333334e-06, "loss": 6.4573, "step": 4 }, { "epoch": 0.054005400540054004, "grad_norm": 55.853845541360975, "learning_rate": 2.6666666666666664e-06, "loss": 6.6238, "step": 5 }, { "epoch": 0.06480648064806481, "grad_norm": 33.76487342816726, "learning_rate": 3.2e-06, "loss": 5.9981, "step": 6 }, { "epoch": 0.07560756075607561, "grad_norm": 23.47380616191989, "learning_rate": 3.7333333333333333e-06, "loss": 5.721, "step": 7 }, { "epoch": 0.08640864086408641, "grad_norm": 19.539382371968657, "learning_rate": 4.266666666666667e-06, "loss": 5.6131, "step": 8 }, { "epoch": 0.09720972097209721, "grad_norm": 14.327743602474902, "learning_rate": 4.8e-06, "loss": 5.2541, "step": 9 }, { "epoch": 0.10801080108010801, "grad_norm": 13.592775716109132, "learning_rate": 5.333333333333333e-06, "loss": 5.1978, "step": 10 }, { "epoch": 0.1188118811881188, "grad_norm": 10.979858971620041, "learning_rate": 5.866666666666666e-06, "loss": 5.0864, "step": 11 }, { "epoch": 0.12961296129612962, "grad_norm": 9.794947757674, "learning_rate": 6.4e-06, "loss": 4.8428, "step": 12 }, { "epoch": 0.1404140414041404, "grad_norm": 9.874078411160985, "learning_rate": 6.933333333333334e-06, "loss": 4.5415, "step": 13 }, { "epoch": 0.15121512151215122, "grad_norm": 9.139382870149774, "learning_rate": 7.466666666666667e-06, "loss": 4.5053, "step": 14 }, { "epoch": 0.162016201620162, "grad_norm": 8.650696439508351, "learning_rate": 8e-06, "loss": 4.2912, "step": 15 }, { "epoch": 0.17281728172817282, "grad_norm": 8.866106038295735, "learning_rate": 7.996671197378331e-06, "loss": 4.0837, "step": 16 }, { "epoch": 0.18361836183618363, "grad_norm": 8.09340750575385, "learning_rate": 7.986690329976772e-06, "loss": 3.794, "step": 17 }, { "epoch": 0.19441944194419442, "grad_norm": 7.969568017047467, "learning_rate": 7.97007400996411e-06, "loss": 3.7631, "step": 18 }, { "epoch": 0.20522052205220523, "grad_norm": 8.622778590137475, "learning_rate": 7.946849893565155e-06, "loss": 3.6164, "step": 19 }, { "epoch": 0.21602160216021601, "grad_norm": 8.935855589318281, "learning_rate": 7.917056635029685e-06, "loss": 3.4103, "step": 20 }, { "epoch": 0.22682268226822683, "grad_norm": 8.294919154259096, "learning_rate": 7.880743822296258e-06, "loss": 3.1529, "step": 21 }, { "epoch": 0.2376237623762376, "grad_norm": 10.069947949166714, "learning_rate": 7.837971894457989e-06, "loss": 3.1933, "step": 22 }, { "epoch": 0.24842484248424843, "grad_norm": 7.613383614891953, "learning_rate": 7.78881204116764e-06, "loss": 2.8644, "step": 23 }, { "epoch": 0.24842484248424843, "eval_loss": NaN, "eval_runtime": 122.7992, "eval_samples_per_second": 8.762, "eval_steps_per_second": 2.191, "step": 23 }, { "epoch": 0.25922592259225924, "grad_norm": 7.683615665676093, "learning_rate": 7.733346084149467e-06, "loss": 2.7816, "step": 24 }, { "epoch": 0.27002700270027, "grad_norm": 6.616788299926916, "learning_rate": 7.671666341015038e-06, "loss": 2.6587, "step": 25 }, { "epoch": 0.2808280828082808, "grad_norm": 8.248800446233975, "learning_rate": 7.6038754716096755e-06, "loss": 2.5965, "step": 26 }, { "epoch": 0.29162916291629165, "grad_norm": 5.768076660552814, "learning_rate": 7.5300863071452845e-06, "loss": 2.3966, "step": 27 }, { "epoch": 0.30243024302430244, "grad_norm": 6.248576515957054, "learning_rate": 7.450421662403922e-06, "loss": 2.3689, "step": 28 }, { "epoch": 0.3132313231323132, "grad_norm": 6.340341378272499, "learning_rate": 7.365014131324725e-06, "loss": 2.2287, "step": 29 }, { "epoch": 0.324032403240324, "grad_norm": 4.698539858663757, "learning_rate": 7.274005866314374e-06, "loss": 2.1459, "step": 30 }, { "epoch": 0.33483348334833485, "grad_norm": 5.352213910449207, "learning_rate": 7.17754834164845e-06, "loss": 2.1892, "step": 31 }, { "epoch": 0.34563456345634563, "grad_norm": 3.8948062761704185, "learning_rate": 7.075802101357448e-06, "loss": 2.0545, "step": 32 }, { "epoch": 0.3564356435643564, "grad_norm": 4.275507225855982, "learning_rate": 6.96893649201708e-06, "loss": 1.9693, "step": 33 }, { "epoch": 0.36723672367236726, "grad_norm": 3.827840441222993, "learning_rate": 6.857129380887614e-06, "loss": 2.2615, "step": 34 }, { "epoch": 0.37803780378037805, "grad_norm": 4.061222798809487, "learning_rate": 6.740566859871377e-06, "loss": 1.907, "step": 35 }, { "epoch": 0.38883888388838883, "grad_norm": 3.764768942445119, "learning_rate": 6.619442935781141e-06, "loss": 1.7926, "step": 36 }, { "epoch": 0.3996399639963996, "grad_norm": 3.6889692023791203, "learning_rate": 6.493959207434934e-06, "loss": 1.8239, "step": 37 }, { "epoch": 0.41044104410441046, "grad_norm": 3.877637135877313, "learning_rate": 6.364324530114706e-06, "loss": 1.7841, "step": 38 }, { "epoch": 0.42124212421242124, "grad_norm": 3.419151921848927, "learning_rate": 6.230754667947318e-06, "loss": 1.7599, "step": 39 }, { "epoch": 0.43204320432043203, "grad_norm": 3.400475978424493, "learning_rate": 6.093471934786448e-06, "loss": 1.7395, "step": 40 }, { "epoch": 0.44284428442844287, "grad_norm": 3.0959631594585924, "learning_rate": 5.952704824193125e-06, "loss": 1.6571, "step": 41 }, { "epoch": 0.45364536453645365, "grad_norm": 2.5886990345494696, "learning_rate": 5.808687629130743e-06, "loss": 1.6854, "step": 42 }, { "epoch": 0.46444644464446444, "grad_norm": 3.764322932438931, "learning_rate": 5.661660052007546e-06, "loss": 1.627, "step": 43 }, { "epoch": 0.4752475247524752, "grad_norm": 2.99531105375008, "learning_rate": 5.511866805715626e-06, "loss": 1.9103, "step": 44 }, { "epoch": 0.48604860486048607, "grad_norm": 3.5132876271811373, "learning_rate": 5.359557206330466e-06, "loss": 1.572, "step": 45 }, { "epoch": 0.49684968496849685, "grad_norm": 2.5665673168109153, "learning_rate": 5.2049847581489365e-06, "loss": 1.5603, "step": 46 }, { "epoch": 0.49684968496849685, "eval_loss": NaN, "eval_runtime": 123.3224, "eval_samples_per_second": 8.725, "eval_steps_per_second": 2.181, "step": 46 }, { "epoch": 0.5076507650765076, "grad_norm": 3.4576040645448205, "learning_rate": 5.048406731756408e-06, "loss": 1.5601, "step": 47 }, { "epoch": 0.5184518451845185, "grad_norm": 2.687807440391295, "learning_rate": 4.890083735825257e-06, "loss": 1.6058, "step": 48 }, { "epoch": 0.5292529252925292, "grad_norm": 2.8487624057382654, "learning_rate": 4.730279283357447e-06, "loss": 1.5174, "step": 49 }, { "epoch": 0.54005400540054, "grad_norm": 2.9197691857946797, "learning_rate": 4.569259353093141e-06, "loss": 1.5429, "step": 50 }, { "epoch": 0.5508550855085509, "grad_norm": 2.7015326813188962, "learning_rate": 4.407291946815342e-06, "loss": 1.6792, "step": 51 }, { "epoch": 0.5616561656165616, "grad_norm": 3.22348351998704, "learning_rate": 4.244646643287371e-06, "loss": 1.5513, "step": 52 }, { "epoch": 0.5724572457245725, "grad_norm": 2.1285589649043466, "learning_rate": 4.081594149565622e-06, "loss": 1.5162, "step": 53 }, { "epoch": 0.5832583258325833, "grad_norm": 2.780214646079723, "learning_rate": 3.918405850434379e-06, "loss": 1.4872, "step": 54 }, { "epoch": 0.594059405940594, "grad_norm": 2.2883370030394503, "learning_rate": 3.75535335671263e-06, "loss": 1.5223, "step": 55 }, { "epoch": 0.6048604860486049, "grad_norm": 2.0052138675618894, "learning_rate": 3.5927080531846593e-06, "loss": 1.5324, "step": 56 }, { "epoch": 0.6156615661566157, "grad_norm": 2.1807110551254403, "learning_rate": 3.4307406469068596e-06, "loss": 1.5016, "step": 57 }, { "epoch": 0.6264626462646264, "grad_norm": 2.105213591481246, "learning_rate": 3.2697207166425537e-06, "loss": 1.4809, "step": 58 }, { "epoch": 0.6372637263726373, "grad_norm": 2.2669612123587544, "learning_rate": 3.1099162641747427e-06, "loss": 1.4593, "step": 59 }, { "epoch": 0.648064806480648, "grad_norm": 2.064860643781488, "learning_rate": 2.9515932682435922e-06, "loss": 1.4086, "step": 60 }, { "epoch": 0.6588658865886589, "grad_norm": 1.8917741415627494, "learning_rate": 2.7950152418510636e-06, "loss": 1.45, "step": 61 }, { "epoch": 0.6696669666966697, "grad_norm": 2.245572260361595, "learning_rate": 2.6404427936695337e-06, "loss": 1.4905, "step": 62 }, { "epoch": 0.6804680468046804, "grad_norm": 2.045925075000629, "learning_rate": 2.4881331942843742e-06, "loss": 1.4649, "step": 63 }, { "epoch": 0.6912691269126913, "grad_norm": 1.877871724293037, "learning_rate": 2.3383399479924544e-06, "loss": 1.4154, "step": 64 }, { "epoch": 0.7020702070207021, "grad_norm": 2.0602410525870822, "learning_rate": 2.1913123708692577e-06, "loss": 1.4173, "step": 65 }, { "epoch": 0.7128712871287128, "grad_norm": 1.8021432076698494, "learning_rate": 2.047295175806876e-06, "loss": 1.4453, "step": 66 }, { "epoch": 0.7236723672367237, "grad_norm": 2.010814723300771, "learning_rate": 1.9065280652135524e-06, "loss": 1.4403, "step": 67 }, { "epoch": 0.7344734473447345, "grad_norm": 1.829550265461561, "learning_rate": 1.7692453320526827e-06, "loss": 1.4541, "step": 68 }, { "epoch": 0.7452745274527453, "grad_norm": 1.8322738616732164, "learning_rate": 1.6356754698852942e-06, "loss": 1.4255, "step": 69 }, { "epoch": 0.7452745274527453, "eval_loss": NaN, "eval_runtime": 122.9964, "eval_samples_per_second": 8.748, "eval_steps_per_second": 2.187, "step": 69 }, { "epoch": 0.7560756075607561, "grad_norm": 1.768581496163203, "learning_rate": 1.506040792565066e-06, "loss": 1.4073, "step": 70 }, { "epoch": 0.7668766876687669, "grad_norm": 1.6601983391555746, "learning_rate": 1.38055706421886e-06, "loss": 1.3416, "step": 71 }, { "epoch": 0.7776777677767777, "grad_norm": 1.5416403691873033, "learning_rate": 1.2594331401286233e-06, "loss": 1.3795, "step": 72 }, { "epoch": 0.7884788478847885, "grad_norm": 1.708242131742048, "learning_rate": 1.1428706191123855e-06, "loss": 1.4519, "step": 73 }, { "epoch": 0.7992799279927992, "grad_norm": 1.58302694826516, "learning_rate": 1.0310635079829202e-06, "loss": 1.4105, "step": 74 }, { "epoch": 0.8100810081008101, "grad_norm": 1.8230575660793402, "learning_rate": 9.241978986425513e-07, "loss": 1.4175, "step": 75 }, { "epoch": 0.8208820882088209, "grad_norm": 1.738630228058889, "learning_rate": 8.224516583515493e-07, "loss": 1.4058, "step": 76 } ], "logging_steps": 1, "max_steps": 92, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 19, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 43330264104960.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }