{
"best_global_step": 650,
"best_metric": 1.6722568273544312,
"best_model_checkpoint": "/content/drive/MyDrive/mistral_aggressive_training/checkpoint-650",
"epoch": 1.9475262368815591,
"eval_steps": 25,
"global_step": 650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014992503748125937,
"grad_norm": 1.3109337091445923,
"learning_rate": 2.4242424242424244e-06,
"loss": 2.1417,
"step": 5
},
{
"epoch": 0.029985007496251874,
"grad_norm": 1.2851405143737793,
"learning_rate": 5.4545454545454545e-06,
"loss": 2.0593,
"step": 10
},
{
"epoch": 0.044977511244377814,
"grad_norm": 1.3587734699249268,
"learning_rate": 8.484848484848486e-06,
"loss": 2.0723,
"step": 15
},
{
"epoch": 0.05997001499250375,
"grad_norm": 1.1362162828445435,
"learning_rate": 1.1515151515151517e-05,
"loss": 2.0299,
"step": 20
},
{
"epoch": 0.07496251874062969,
"grad_norm": 1.0861225128173828,
"learning_rate": 1.4545454545454546e-05,
"loss": 2.0103,
"step": 25
},
{
"epoch": 0.07496251874062969,
"eval_loss": 1.9985558986663818,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.1818,
"eval_samples_per_second": 2.001,
"eval_steps_per_second": 0.502,
"step": 25
},
{
"epoch": 0.08995502248875563,
"grad_norm": 1.1834365129470825,
"learning_rate": 1.7575757575757576e-05,
"loss": 1.9597,
"step": 30
},
{
"epoch": 0.10494752623688156,
"grad_norm": 0.9937852025032043,
"learning_rate": 1.9968503937007875e-05,
"loss": 1.9421,
"step": 35
},
{
"epoch": 0.1199400299850075,
"grad_norm": 1.0617958307266235,
"learning_rate": 1.9811023622047244e-05,
"loss": 2.024,
"step": 40
},
{
"epoch": 0.13493253373313344,
"grad_norm": 1.0851763486862183,
"learning_rate": 1.9653543307086616e-05,
"loss": 1.8302,
"step": 45
},
{
"epoch": 0.14992503748125938,
"grad_norm": 1.1397876739501953,
"learning_rate": 1.9496062992125985e-05,
"loss": 1.9633,
"step": 50
},
{
"epoch": 0.14992503748125938,
"eval_loss": 1.9225448369979858,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9066,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 50
},
{
"epoch": 0.16491754122938532,
"grad_norm": 1.0546423196792603,
"learning_rate": 1.9338582677165357e-05,
"loss": 1.8928,
"step": 55
},
{
"epoch": 0.17991004497751126,
"grad_norm": 1.0976288318634033,
"learning_rate": 1.9181102362204726e-05,
"loss": 1.8646,
"step": 60
},
{
"epoch": 0.19490254872563717,
"grad_norm": 1.0164967775344849,
"learning_rate": 1.9023622047244098e-05,
"loss": 1.9247,
"step": 65
},
{
"epoch": 0.2098950524737631,
"grad_norm": 1.0609030723571777,
"learning_rate": 1.8866141732283464e-05,
"loss": 1.8895,
"step": 70
},
{
"epoch": 0.22488755622188905,
"grad_norm": 1.2414894104003906,
"learning_rate": 1.8708661417322836e-05,
"loss": 1.9753,
"step": 75
},
{
"epoch": 0.22488755622188905,
"eval_loss": 1.8853719234466553,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.658,
"eval_samples_per_second": 2.004,
"eval_steps_per_second": 0.503,
"step": 75
},
{
"epoch": 0.239880059970015,
"grad_norm": 1.1238614320755005,
"learning_rate": 1.8551181102362205e-05,
"loss": 1.8656,
"step": 80
},
{
"epoch": 0.25487256371814093,
"grad_norm": 1.143813967704773,
"learning_rate": 1.8393700787401577e-05,
"loss": 1.8317,
"step": 85
},
{
"epoch": 0.2698650674662669,
"grad_norm": 1.2623740434646606,
"learning_rate": 1.8236220472440946e-05,
"loss": 1.8669,
"step": 90
},
{
"epoch": 0.2848575712143928,
"grad_norm": 1.04632568359375,
"learning_rate": 1.8078740157480318e-05,
"loss": 1.7412,
"step": 95
},
{
"epoch": 0.29985007496251875,
"grad_norm": 1.0752465724945068,
"learning_rate": 1.7921259842519687e-05,
"loss": 1.808,
"step": 100
},
{
"epoch": 0.29985007496251875,
"eval_loss": 1.8590933084487915,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9821,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 100
},
{
"epoch": 0.3148425787106447,
"grad_norm": 1.1382629871368408,
"learning_rate": 1.776377952755906e-05,
"loss": 1.8629,
"step": 105
},
{
"epoch": 0.32983508245877063,
"grad_norm": 1.4844969511032104,
"learning_rate": 1.7606299212598424e-05,
"loss": 1.8546,
"step": 110
},
{
"epoch": 0.3448275862068966,
"grad_norm": 1.1989675760269165,
"learning_rate": 1.7448818897637796e-05,
"loss": 1.8497,
"step": 115
},
{
"epoch": 0.3598200899550225,
"grad_norm": 1.2271519899368286,
"learning_rate": 1.7291338582677165e-05,
"loss": 1.896,
"step": 120
},
{
"epoch": 0.3748125937031484,
"grad_norm": 1.179254412651062,
"learning_rate": 1.7133858267716537e-05,
"loss": 1.8161,
"step": 125
},
{
"epoch": 0.3748125937031484,
"eval_loss": 1.836364507675171,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.1432,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 125
},
{
"epoch": 0.38980509745127434,
"grad_norm": 1.3870253562927246,
"learning_rate": 1.6976377952755906e-05,
"loss": 1.8299,
"step": 130
},
{
"epoch": 0.4047976011994003,
"grad_norm": 1.1159974336624146,
"learning_rate": 1.6818897637795278e-05,
"loss": 1.7106,
"step": 135
},
{
"epoch": 0.4197901049475262,
"grad_norm": 1.2132537364959717,
"learning_rate": 1.6661417322834647e-05,
"loss": 1.8106,
"step": 140
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.180285930633545,
"learning_rate": 1.650393700787402e-05,
"loss": 1.7911,
"step": 145
},
{
"epoch": 0.4497751124437781,
"grad_norm": 1.2773538827896118,
"learning_rate": 1.6346456692913385e-05,
"loss": 1.8412,
"step": 150
},
{
"epoch": 0.4497751124437781,
"eval_loss": 1.817762017250061,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.8218,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 150
},
{
"epoch": 0.46476761619190404,
"grad_norm": 1.2192872762680054,
"learning_rate": 1.6188976377952757e-05,
"loss": 1.8606,
"step": 155
},
{
"epoch": 0.47976011994003,
"grad_norm": 1.4267574548721313,
"learning_rate": 1.6031496062992126e-05,
"loss": 1.8088,
"step": 160
},
{
"epoch": 0.4947526236881559,
"grad_norm": 1.4275727272033691,
"learning_rate": 1.5874015748031498e-05,
"loss": 1.7979,
"step": 165
},
{
"epoch": 0.5097451274362819,
"grad_norm": 1.3130961656570435,
"learning_rate": 1.5716535433070866e-05,
"loss": 1.8059,
"step": 170
},
{
"epoch": 0.5247376311844077,
"grad_norm": 1.2740048170089722,
"learning_rate": 1.555905511811024e-05,
"loss": 1.7689,
"step": 175
},
{
"epoch": 0.5247376311844077,
"eval_loss": 1.801321268081665,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.8247,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 175
},
{
"epoch": 0.5397301349325337,
"grad_norm": 1.1887905597686768,
"learning_rate": 1.5401574803149607e-05,
"loss": 1.7052,
"step": 180
},
{
"epoch": 0.5547226386806596,
"grad_norm": 1.3410052061080933,
"learning_rate": 1.5244094488188978e-05,
"loss": 1.7916,
"step": 185
},
{
"epoch": 0.5697151424287856,
"grad_norm": 1.2976702451705933,
"learning_rate": 1.5086614173228347e-05,
"loss": 1.6751,
"step": 190
},
{
"epoch": 0.5847076461769115,
"grad_norm": 1.3378161191940308,
"learning_rate": 1.4929133858267717e-05,
"loss": 1.6846,
"step": 195
},
{
"epoch": 0.5997001499250375,
"grad_norm": 1.3700908422470093,
"learning_rate": 1.4771653543307088e-05,
"loss": 1.7537,
"step": 200
},
{
"epoch": 0.5997001499250375,
"eval_loss": 1.786393165588379,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0429,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 200
},
{
"epoch": 0.6146926536731634,
"grad_norm": 1.3000359535217285,
"learning_rate": 1.4614173228346458e-05,
"loss": 1.7362,
"step": 205
},
{
"epoch": 0.6296851574212894,
"grad_norm": 1.3432403802871704,
"learning_rate": 1.4456692913385829e-05,
"loss": 1.7752,
"step": 210
},
{
"epoch": 0.6446776611694153,
"grad_norm": 1.4008256196975708,
"learning_rate": 1.4299212598425199e-05,
"loss": 1.7294,
"step": 215
},
{
"epoch": 0.6596701649175413,
"grad_norm": 1.2891970872879028,
"learning_rate": 1.4141732283464568e-05,
"loss": 1.7085,
"step": 220
},
{
"epoch": 0.6746626686656672,
"grad_norm": 1.1324400901794434,
"learning_rate": 1.3984251968503938e-05,
"loss": 1.7697,
"step": 225
},
{
"epoch": 0.6746626686656672,
"eval_loss": 1.7725938558578491,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0819,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 225
},
{
"epoch": 0.6896551724137931,
"grad_norm": 1.4434219598770142,
"learning_rate": 1.3826771653543307e-05,
"loss": 1.7308,
"step": 230
},
{
"epoch": 0.704647676161919,
"grad_norm": 1.3460474014282227,
"learning_rate": 1.3669291338582678e-05,
"loss": 1.7323,
"step": 235
},
{
"epoch": 0.719640179910045,
"grad_norm": 1.2641957998275757,
"learning_rate": 1.3511811023622048e-05,
"loss": 1.6987,
"step": 240
},
{
"epoch": 0.7346326836581709,
"grad_norm": 1.3505923748016357,
"learning_rate": 1.3354330708661419e-05,
"loss": 1.6961,
"step": 245
},
{
"epoch": 0.7496251874062968,
"grad_norm": 1.4946131706237793,
"learning_rate": 1.3196850393700789e-05,
"loss": 1.733,
"step": 250
},
{
"epoch": 0.7496251874062968,
"eval_loss": 1.7611132860183716,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.8197,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 250
},
{
"epoch": 0.7646176911544228,
"grad_norm": 1.433475375175476,
"learning_rate": 1.303937007874016e-05,
"loss": 1.8026,
"step": 255
},
{
"epoch": 0.7796101949025487,
"grad_norm": 1.448909044265747,
"learning_rate": 1.288188976377953e-05,
"loss": 1.7112,
"step": 260
},
{
"epoch": 0.7946026986506747,
"grad_norm": 1.3357568979263306,
"learning_rate": 1.2724409448818899e-05,
"loss": 1.6413,
"step": 265
},
{
"epoch": 0.8095952023988006,
"grad_norm": 1.4122332334518433,
"learning_rate": 1.2566929133858268e-05,
"loss": 1.7038,
"step": 270
},
{
"epoch": 0.8245877061469266,
"grad_norm": 1.5254954099655151,
"learning_rate": 1.2409448818897638e-05,
"loss": 1.6762,
"step": 275
},
{
"epoch": 0.8245877061469266,
"eval_loss": 1.7509944438934326,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0835,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 275
},
{
"epoch": 0.8395802098950524,
"grad_norm": 1.4556777477264404,
"learning_rate": 1.2251968503937009e-05,
"loss": 1.7421,
"step": 280
},
{
"epoch": 0.8545727136431784,
"grad_norm": 1.2593179941177368,
"learning_rate": 1.2094488188976379e-05,
"loss": 1.7368,
"step": 285
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.5513230562210083,
"learning_rate": 1.193700787401575e-05,
"loss": 1.6937,
"step": 290
},
{
"epoch": 0.8845577211394303,
"grad_norm": 1.450356125831604,
"learning_rate": 1.177952755905512e-05,
"loss": 1.6819,
"step": 295
},
{
"epoch": 0.8995502248875562,
"grad_norm": 1.4953676462173462,
"learning_rate": 1.162204724409449e-05,
"loss": 1.7522,
"step": 300
},
{
"epoch": 0.8995502248875562,
"eval_loss": 1.7417070865631104,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9637,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 300
},
{
"epoch": 0.9145427286356822,
"grad_norm": 1.3364105224609375,
"learning_rate": 1.1464566929133861e-05,
"loss": 1.7016,
"step": 305
},
{
"epoch": 0.9295352323838081,
"grad_norm": 1.7076566219329834,
"learning_rate": 1.1307086614173228e-05,
"loss": 1.7504,
"step": 310
},
{
"epoch": 0.9445277361319341,
"grad_norm": 1.397580862045288,
"learning_rate": 1.1149606299212599e-05,
"loss": 1.6943,
"step": 315
},
{
"epoch": 0.95952023988006,
"grad_norm": 1.4239177703857422,
"learning_rate": 1.0992125984251969e-05,
"loss": 1.6791,
"step": 320
},
{
"epoch": 0.974512743628186,
"grad_norm": 1.4564177989959717,
"learning_rate": 1.083464566929134e-05,
"loss": 1.7237,
"step": 325
},
{
"epoch": 0.974512743628186,
"eval_loss": 1.7306653261184692,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0457,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 325
},
{
"epoch": 0.9895052473763118,
"grad_norm": 1.4340523481369019,
"learning_rate": 1.067716535433071e-05,
"loss": 1.7005,
"step": 330
},
{
"epoch": 1.0029985007496252,
"grad_norm": 1.3697952032089233,
"learning_rate": 1.051968503937008e-05,
"loss": 1.6647,
"step": 335
},
{
"epoch": 1.0179910044977512,
"grad_norm": 1.4047369956970215,
"learning_rate": 1.0362204724409451e-05,
"loss": 1.7556,
"step": 340
},
{
"epoch": 1.0329835082458771,
"grad_norm": 1.493369460105896,
"learning_rate": 1.0204724409448821e-05,
"loss": 1.6457,
"step": 345
},
{
"epoch": 1.047976011994003,
"grad_norm": 1.6234937906265259,
"learning_rate": 1.0047244094488188e-05,
"loss": 1.6274,
"step": 350
},
{
"epoch": 1.047976011994003,
"eval_loss": 1.7234022617340088,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.754,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 350
},
{
"epoch": 1.062968515742129,
"grad_norm": 1.6586315631866455,
"learning_rate": 9.88976377952756e-06,
"loss": 1.6198,
"step": 355
},
{
"epoch": 1.077961019490255,
"grad_norm": 1.5955413579940796,
"learning_rate": 9.73228346456693e-06,
"loss": 1.5932,
"step": 360
},
{
"epoch": 1.092953523238381,
"grad_norm": 1.6776100397109985,
"learning_rate": 9.5748031496063e-06,
"loss": 1.6465,
"step": 365
},
{
"epoch": 1.1079460269865067,
"grad_norm": 1.4672406911849976,
"learning_rate": 9.41732283464567e-06,
"loss": 1.6208,
"step": 370
},
{
"epoch": 1.1229385307346327,
"grad_norm": 1.507462978363037,
"learning_rate": 9.259842519685041e-06,
"loss": 1.6205,
"step": 375
},
{
"epoch": 1.1229385307346327,
"eval_loss": 1.7159619331359863,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9627,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 375
},
{
"epoch": 1.1379310344827587,
"grad_norm": 1.6919342279434204,
"learning_rate": 9.10236220472441e-06,
"loss": 1.6921,
"step": 380
},
{
"epoch": 1.1529235382308847,
"grad_norm": 1.569090723991394,
"learning_rate": 8.94488188976378e-06,
"loss": 1.6457,
"step": 385
},
{
"epoch": 1.1679160419790104,
"grad_norm": 1.6199414730072021,
"learning_rate": 8.78740157480315e-06,
"loss": 1.6988,
"step": 390
},
{
"epoch": 1.1829085457271364,
"grad_norm": 1.4945182800292969,
"learning_rate": 8.629921259842521e-06,
"loss": 1.5946,
"step": 395
},
{
"epoch": 1.1979010494752624,
"grad_norm": 1.6097604036331177,
"learning_rate": 8.47244094488189e-06,
"loss": 1.6567,
"step": 400
},
{
"epoch": 1.1979010494752624,
"eval_loss": 1.7092427015304565,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9844,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 400
},
{
"epoch": 1.2128935532233882,
"grad_norm": 1.5406345129013062,
"learning_rate": 8.31496062992126e-06,
"loss": 1.5858,
"step": 405
},
{
"epoch": 1.2278860569715142,
"grad_norm": 1.7372868061065674,
"learning_rate": 8.157480314960631e-06,
"loss": 1.5864,
"step": 410
},
{
"epoch": 1.2428785607196402,
"grad_norm": 1.7523705959320068,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6189,
"step": 415
},
{
"epoch": 1.2578710644677662,
"grad_norm": 1.6225429773330688,
"learning_rate": 7.84251968503937e-06,
"loss": 1.5691,
"step": 420
},
{
"epoch": 1.272863568215892,
"grad_norm": 1.6807804107666016,
"learning_rate": 7.68503937007874e-06,
"loss": 1.5915,
"step": 425
},
{
"epoch": 1.272863568215892,
"eval_loss": 1.7035413980484009,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9113,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 425
},
{
"epoch": 1.287856071964018,
"grad_norm": 1.6341338157653809,
"learning_rate": 7.527559055118111e-06,
"loss": 1.5694,
"step": 430
},
{
"epoch": 1.302848575712144,
"grad_norm": 1.6684147119522095,
"learning_rate": 7.3700787401574816e-06,
"loss": 1.7089,
"step": 435
},
{
"epoch": 1.31784107946027,
"grad_norm": 1.7899603843688965,
"learning_rate": 7.21259842519685e-06,
"loss": 1.613,
"step": 440
},
{
"epoch": 1.3328335832083957,
"grad_norm": 1.667357325553894,
"learning_rate": 7.055118110236221e-06,
"loss": 1.6017,
"step": 445
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.6792216300964355,
"learning_rate": 6.897637795275591e-06,
"loss": 1.6203,
"step": 450
},
{
"epoch": 1.3478260869565217,
"eval_loss": 1.6988191604614258,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.913,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 450
},
{
"epoch": 1.3628185907046477,
"grad_norm": 1.6012407541275024,
"learning_rate": 6.740157480314962e-06,
"loss": 1.6082,
"step": 455
},
{
"epoch": 1.3778110944527735,
"grad_norm": 1.7225656509399414,
"learning_rate": 6.5826771653543306e-06,
"loss": 1.5841,
"step": 460
},
{
"epoch": 1.3928035982008995,
"grad_norm": 1.6564732789993286,
"learning_rate": 6.425196850393701e-06,
"loss": 1.7057,
"step": 465
},
{
"epoch": 1.4077961019490255,
"grad_norm": 1.5676624774932861,
"learning_rate": 6.2677165354330715e-06,
"loss": 1.5861,
"step": 470
},
{
"epoch": 1.4227886056971515,
"grad_norm": 1.7391592264175415,
"learning_rate": 6.110236220472442e-06,
"loss": 1.6339,
"step": 475
},
{
"epoch": 1.4227886056971515,
"eval_loss": 1.6934857368469238,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.8519,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 475
},
{
"epoch": 1.4377811094452775,
"grad_norm": 1.8171783685684204,
"learning_rate": 5.952755905511811e-06,
"loss": 1.5567,
"step": 480
},
{
"epoch": 1.4527736131934033,
"grad_norm": 1.6563059091567993,
"learning_rate": 5.795275590551181e-06,
"loss": 1.5669,
"step": 485
},
{
"epoch": 1.4677661169415293,
"grad_norm": 1.7603881359100342,
"learning_rate": 5.637795275590552e-06,
"loss": 1.6006,
"step": 490
},
{
"epoch": 1.4827586206896552,
"grad_norm": 1.951175570487976,
"learning_rate": 5.480314960629922e-06,
"loss": 1.7085,
"step": 495
},
{
"epoch": 1.497751124437781,
"grad_norm": 1.6208112239837646,
"learning_rate": 5.322834645669291e-06,
"loss": 1.6304,
"step": 500
},
{
"epoch": 1.497751124437781,
"eval_loss": 1.6872224807739258,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9678,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 500
},
{
"epoch": 1.512743628185907,
"grad_norm": 1.71792471408844,
"learning_rate": 5.1653543307086615e-06,
"loss": 1.6648,
"step": 505
},
{
"epoch": 1.527736131934033,
"grad_norm": 1.71132493019104,
"learning_rate": 5.007874015748032e-06,
"loss": 1.594,
"step": 510
},
{
"epoch": 1.5427286356821588,
"grad_norm": 1.6670012474060059,
"learning_rate": 4.850393700787402e-06,
"loss": 1.5709,
"step": 515
},
{
"epoch": 1.557721139430285,
"grad_norm": 1.6870834827423096,
"learning_rate": 4.692913385826772e-06,
"loss": 1.6262,
"step": 520
},
{
"epoch": 1.5727136431784108,
"grad_norm": 1.8740547895431519,
"learning_rate": 4.535433070866142e-06,
"loss": 1.7051,
"step": 525
},
{
"epoch": 1.5727136431784108,
"eval_loss": 1.682806372642517,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0592,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 525
},
{
"epoch": 1.5877061469265368,
"grad_norm": 1.6121946573257446,
"learning_rate": 4.377952755905512e-06,
"loss": 1.577,
"step": 530
},
{
"epoch": 1.6026986506746628,
"grad_norm": 1.8216692209243774,
"learning_rate": 4.220472440944882e-06,
"loss": 1.5505,
"step": 535
},
{
"epoch": 1.6176911544227885,
"grad_norm": 1.7587759494781494,
"learning_rate": 4.062992125984252e-06,
"loss": 1.6189,
"step": 540
},
{
"epoch": 1.6326836581709145,
"grad_norm": 1.8402221202850342,
"learning_rate": 3.905511811023622e-06,
"loss": 1.6749,
"step": 545
},
{
"epoch": 1.6476761619190405,
"grad_norm": 1.6589854955673218,
"learning_rate": 3.748031496062993e-06,
"loss": 1.7241,
"step": 550
},
{
"epoch": 1.6476761619190405,
"eval_loss": 1.6803463697433472,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9675,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 550
},
{
"epoch": 1.6626686656671663,
"grad_norm": 1.6675294637680054,
"learning_rate": 3.5905511811023625e-06,
"loss": 1.5372,
"step": 555
},
{
"epoch": 1.6776611694152923,
"grad_norm": 1.6088923215866089,
"learning_rate": 3.433070866141733e-06,
"loss": 1.6013,
"step": 560
},
{
"epoch": 1.6926536731634183,
"grad_norm": 1.9371333122253418,
"learning_rate": 3.2755905511811026e-06,
"loss": 1.7105,
"step": 565
},
{
"epoch": 1.707646176911544,
"grad_norm": 1.9125174283981323,
"learning_rate": 3.118110236220473e-06,
"loss": 1.6931,
"step": 570
},
{
"epoch": 1.7226386806596703,
"grad_norm": 1.805245041847229,
"learning_rate": 2.9606299212598427e-06,
"loss": 1.5554,
"step": 575
},
{
"epoch": 1.7226386806596703,
"eval_loss": 1.6775026321411133,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9093,
"eval_samples_per_second": 2.003,
"eval_steps_per_second": 0.502,
"step": 575
},
{
"epoch": 1.737631184407796,
"grad_norm": 1.753201961517334,
"learning_rate": 2.803149606299213e-06,
"loss": 1.5935,
"step": 580
},
{
"epoch": 1.752623688155922,
"grad_norm": 1.9081984758377075,
"learning_rate": 2.645669291338583e-06,
"loss": 1.6316,
"step": 585
},
{
"epoch": 1.767616191904048,
"grad_norm": 1.7977555990219116,
"learning_rate": 2.488188976377953e-06,
"loss": 1.5451,
"step": 590
},
{
"epoch": 1.7826086956521738,
"grad_norm": 1.817696213722229,
"learning_rate": 2.330708661417323e-06,
"loss": 1.5799,
"step": 595
},
{
"epoch": 1.7976011994002998,
"grad_norm": 1.7235620021820068,
"learning_rate": 2.173228346456693e-06,
"loss": 1.6196,
"step": 600
},
{
"epoch": 1.7976011994002998,
"eval_loss": 1.6750439405441284,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.07,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 600
},
{
"epoch": 1.8125937031484258,
"grad_norm": 1.7125576734542847,
"learning_rate": 2.015748031496063e-06,
"loss": 1.5546,
"step": 605
},
{
"epoch": 1.8275862068965516,
"grad_norm": 1.693699598312378,
"learning_rate": 1.8582677165354333e-06,
"loss": 1.5906,
"step": 610
},
{
"epoch": 1.8425787106446778,
"grad_norm": 1.7908601760864258,
"learning_rate": 1.7007874015748034e-06,
"loss": 1.6616,
"step": 615
},
{
"epoch": 1.8575712143928036,
"grad_norm": 1.7427383661270142,
"learning_rate": 1.5433070866141734e-06,
"loss": 1.5276,
"step": 620
},
{
"epoch": 1.8725637181409296,
"grad_norm": 2.0088937282562256,
"learning_rate": 1.3858267716535435e-06,
"loss": 1.5377,
"step": 625
},
{
"epoch": 1.8725637181409296,
"eval_loss": 1.673450231552124,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 464.0489,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 625
},
{
"epoch": 1.8875562218890556,
"grad_norm": 1.863299012184143,
"learning_rate": 1.2283464566929135e-06,
"loss": 1.545,
"step": 630
},
{
"epoch": 1.9025487256371814,
"grad_norm": 1.5976277589797974,
"learning_rate": 1.0708661417322836e-06,
"loss": 1.6201,
"step": 635
},
{
"epoch": 1.9175412293853074,
"grad_norm": 1.758497953414917,
"learning_rate": 9.133858267716536e-07,
"loss": 1.6426,
"step": 640
},
{
"epoch": 1.9325337331334334,
"grad_norm": 1.696065902709961,
"learning_rate": 7.559055118110237e-07,
"loss": 1.7097,
"step": 645
},
{
"epoch": 1.9475262368815591,
"grad_norm": 1.9475734233856201,
"learning_rate": 5.984251968503937e-07,
"loss": 1.6048,
"step": 650
},
{
"epoch": 1.9475262368815591,
"eval_loss": 1.6722568273544312,
"eval_model_preparation_time": 0.0233,
"eval_runtime": 463.9611,
"eval_samples_per_second": 2.002,
"eval_steps_per_second": 0.502,
"step": 650
}
],
"logging_steps": 5,
"max_steps": 668,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0005
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.115437053132145e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}