Shahradmz's picture
dataset 0 reward model training
65bb19b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.38278977185729596,
"eval_steps": 10,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001531159087429184,
"grad_norm": 270.11077880859375,
"learning_rate": 4.996172102281427e-06,
"loss": 1.2137,
"step": 10
},
{
"epoch": 0.001531159087429184,
"eval_accuracy": 0.5366216907106497,
"eval_loss": 1.2146648168563843,
"eval_runtime": 277.3281,
"eval_samples_per_second": 162.796,
"eval_steps_per_second": 20.351,
"step": 10
},
{
"epoch": 0.003062318174858368,
"grad_norm": 69.96483612060547,
"learning_rate": 4.9923442045628545e-06,
"loss": 0.8743,
"step": 20
},
{
"epoch": 0.003062318174858368,
"eval_accuracy": 0.5232508865445948,
"eval_loss": 1.02613365650177,
"eval_runtime": 276.3955,
"eval_samples_per_second": 163.346,
"eval_steps_per_second": 20.42,
"step": 20
},
{
"epoch": 0.004593477262287551,
"grad_norm": 120.09966278076172,
"learning_rate": 4.988516306844281e-06,
"loss": 1.0324,
"step": 30
},
{
"epoch": 0.004593477262287551,
"eval_accuracy": 0.5467704523151039,
"eval_loss": 0.9261217713356018,
"eval_runtime": 277.0496,
"eval_samples_per_second": 162.96,
"eval_steps_per_second": 20.372,
"step": 30
},
{
"epoch": 0.006124636349716736,
"grad_norm": 76.13516998291016,
"learning_rate": 4.984688409125709e-06,
"loss": 0.9529,
"step": 40
},
{
"epoch": 0.006124636349716736,
"eval_accuracy": 0.5361401310932641,
"eval_loss": 0.8699733018875122,
"eval_runtime": 276.196,
"eval_samples_per_second": 163.464,
"eval_steps_per_second": 20.435,
"step": 40
},
{
"epoch": 0.007655795437145919,
"grad_norm": 51.14548110961914,
"learning_rate": 4.980860511407135e-06,
"loss": 0.8279,
"step": 50
},
{
"epoch": 0.007655795437145919,
"eval_accuracy": 0.5364695340501792,
"eval_loss": 0.8454414010047913,
"eval_runtime": 277.7047,
"eval_samples_per_second": 162.576,
"eval_steps_per_second": 20.324,
"step": 50
},
{
"epoch": 0.009186954524575103,
"grad_norm": 29.365215301513672,
"learning_rate": 4.977032613688563e-06,
"loss": 0.6843,
"step": 60
},
{
"epoch": 0.009186954524575103,
"eval_accuracy": 0.5485189652861546,
"eval_loss": 0.8400074243545532,
"eval_runtime": 276.2282,
"eval_samples_per_second": 163.445,
"eval_steps_per_second": 20.432,
"step": 60
},
{
"epoch": 0.010718113612004287,
"grad_norm": 167.26617431640625,
"learning_rate": 4.9732047159699895e-06,
"loss": 0.8245,
"step": 70
},
{
"epoch": 0.010718113612004287,
"eval_accuracy": 0.5589761867634938,
"eval_loss": 1.03753662109375,
"eval_runtime": 280.6426,
"eval_samples_per_second": 160.874,
"eval_steps_per_second": 20.111,
"step": 70
},
{
"epoch": 0.012249272699433471,
"grad_norm": 104.5846939086914,
"learning_rate": 4.969376818251417e-06,
"loss": 0.8412,
"step": 80
},
{
"epoch": 0.012249272699433471,
"eval_accuracy": 0.5760374802939805,
"eval_loss": 0.7869167923927307,
"eval_runtime": 281.2402,
"eval_samples_per_second": 160.532,
"eval_steps_per_second": 20.068,
"step": 80
},
{
"epoch": 0.013780431786862656,
"grad_norm": 35.11406326293945,
"learning_rate": 4.965548920532844e-06,
"loss": 0.8279,
"step": 90
},
{
"epoch": 0.013780431786862656,
"eval_accuracy": 0.5787423483583751,
"eval_loss": 0.724981963634491,
"eval_runtime": 277.5138,
"eval_samples_per_second": 162.687,
"eval_steps_per_second": 20.338,
"step": 90
},
{
"epoch": 0.015311590874291839,
"grad_norm": 25.292316436767578,
"learning_rate": 4.961721022814271e-06,
"loss": 0.6736,
"step": 100
},
{
"epoch": 0.015311590874291839,
"eval_accuracy": 0.5338034748948856,
"eval_loss": 0.8150990009307861,
"eval_runtime": 278.062,
"eval_samples_per_second": 162.367,
"eval_steps_per_second": 20.298,
"step": 100
},
{
"epoch": 0.016842749961721023,
"grad_norm": 21.010047912597656,
"learning_rate": 4.957893125095698e-06,
"loss": 0.729,
"step": 110
},
{
"epoch": 0.016842749961721023,
"eval_accuracy": 0.5677641824249166,
"eval_loss": 0.833014965057373,
"eval_runtime": 278.6412,
"eval_samples_per_second": 162.029,
"eval_steps_per_second": 20.255,
"step": 110
},
{
"epoch": 0.018373909049150206,
"grad_norm": 29.881685256958008,
"learning_rate": 4.954065227377125e-06,
"loss": 0.771,
"step": 120
},
{
"epoch": 0.018373909049150206,
"eval_accuracy": 0.5731047331765958,
"eval_loss": 0.7811030745506287,
"eval_runtime": 276.9087,
"eval_samples_per_second": 163.043,
"eval_steps_per_second": 20.382,
"step": 120
},
{
"epoch": 0.01990506813657939,
"grad_norm": 15.595231056213379,
"learning_rate": 4.950237329658552e-06,
"loss": 0.7783,
"step": 130
},
{
"epoch": 0.01990506813657939,
"eval_accuracy": 0.5626015898109594,
"eval_loss": 0.7167708873748779,
"eval_runtime": 277.1636,
"eval_samples_per_second": 162.893,
"eval_steps_per_second": 20.363,
"step": 130
},
{
"epoch": 0.021436227224008574,
"grad_norm": 22.25090789794922,
"learning_rate": 4.946409431939979e-06,
"loss": 0.6794,
"step": 140
},
{
"epoch": 0.021436227224008574,
"eval_accuracy": 0.5578002997964069,
"eval_loss": 0.7202938795089722,
"eval_runtime": 278.0355,
"eval_samples_per_second": 162.382,
"eval_steps_per_second": 20.3,
"step": 140
},
{
"epoch": 0.022967386311437757,
"grad_norm": 37.13331604003906,
"learning_rate": 4.942581534221406e-06,
"loss": 0.7133,
"step": 150
},
{
"epoch": 0.022967386311437757,
"eval_accuracy": 0.5772544642857143,
"eval_loss": 0.7133862972259521,
"eval_runtime": 277.8264,
"eval_samples_per_second": 162.504,
"eval_steps_per_second": 20.315,
"step": 150
},
{
"epoch": 0.024498545398866943,
"grad_norm": 28.552654266357422,
"learning_rate": 4.9387536365028335e-06,
"loss": 0.7334,
"step": 160
},
{
"epoch": 0.024498545398866943,
"eval_accuracy": 0.5714637421665174,
"eval_loss": 0.7050605416297913,
"eval_runtime": 277.1231,
"eval_samples_per_second": 162.917,
"eval_steps_per_second": 20.366,
"step": 160
},
{
"epoch": 0.026029704486296126,
"grad_norm": 18.05495834350586,
"learning_rate": 4.93492573878426e-06,
"loss": 0.7056,
"step": 170
},
{
"epoch": 0.026029704486296126,
"eval_accuracy": 0.5888312385382655,
"eval_loss": 0.6785433888435364,
"eval_runtime": 275.2348,
"eval_samples_per_second": 164.034,
"eval_steps_per_second": 20.506,
"step": 170
},
{
"epoch": 0.02756086357372531,
"grad_norm": 14.354559898376465,
"learning_rate": 4.931097841065688e-06,
"loss": 0.6317,
"step": 180
},
{
"epoch": 0.02756086357372531,
"eval_accuracy": 0.5905926454837558,
"eval_loss": 0.6833683252334595,
"eval_runtime": 278.3913,
"eval_samples_per_second": 162.175,
"eval_steps_per_second": 20.274,
"step": 180
},
{
"epoch": 0.029092022661154494,
"grad_norm": 10.222257614135742,
"learning_rate": 4.927269943347114e-06,
"loss": 0.6975,
"step": 190
},
{
"epoch": 0.029092022661154494,
"eval_accuracy": 0.5688126660860633,
"eval_loss": 0.6987228989601135,
"eval_runtime": 278.0516,
"eval_samples_per_second": 162.373,
"eval_steps_per_second": 20.298,
"step": 190
},
{
"epoch": 0.030623181748583677,
"grad_norm": 19.388473510742188,
"learning_rate": 4.923442045628542e-06,
"loss": 0.675,
"step": 200
},
{
"epoch": 0.030623181748583677,
"eval_accuracy": 0.5726792755463477,
"eval_loss": 0.7145602107048035,
"eval_runtime": 280.8737,
"eval_samples_per_second": 160.741,
"eval_steps_per_second": 20.094,
"step": 200
},
{
"epoch": 0.03215434083601286,
"grad_norm": 21.06436538696289,
"learning_rate": 4.919614147909968e-06,
"loss": 0.6576,
"step": 210
},
{
"epoch": 0.03215434083601286,
"eval_accuracy": 0.5798941563639598,
"eval_loss": 0.7224599123001099,
"eval_runtime": 276.3195,
"eval_samples_per_second": 163.391,
"eval_steps_per_second": 20.426,
"step": 210
},
{
"epoch": 0.033685499923442046,
"grad_norm": 14.679203987121582,
"learning_rate": 4.915786250191396e-06,
"loss": 0.631,
"step": 220
},
{
"epoch": 0.033685499923442046,
"eval_accuracy": 0.5863426131815351,
"eval_loss": 0.6974319815635681,
"eval_runtime": 275.464,
"eval_samples_per_second": 163.898,
"eval_steps_per_second": 20.489,
"step": 220
},
{
"epoch": 0.03521665901087123,
"grad_norm": 22.14579963684082,
"learning_rate": 4.9119583524728225e-06,
"loss": 0.7267,
"step": 230
},
{
"epoch": 0.03521665901087123,
"eval_accuracy": 0.5820226223144368,
"eval_loss": 0.7019667625427246,
"eval_runtime": 276.7093,
"eval_samples_per_second": 163.16,
"eval_steps_per_second": 20.397,
"step": 230
},
{
"epoch": 0.03674781809830041,
"grad_norm": 7.788635730743408,
"learning_rate": 4.908130454754249e-06,
"loss": 0.6079,
"step": 240
},
{
"epoch": 0.03674781809830041,
"eval_accuracy": 0.5955016145195412,
"eval_loss": 0.6954487562179565,
"eval_runtime": 279.6934,
"eval_samples_per_second": 161.42,
"eval_steps_per_second": 20.179,
"step": 240
},
{
"epoch": 0.0382789771857296,
"grad_norm": 10.69842529296875,
"learning_rate": 4.904302557035676e-06,
"loss": 0.6862,
"step": 250
},
{
"epoch": 0.0382789771857296,
"eval_accuracy": 0.5917668001780151,
"eval_loss": 0.7000778913497925,
"eval_runtime": 279.4881,
"eval_samples_per_second": 161.538,
"eval_steps_per_second": 20.194,
"step": 250
},
{
"epoch": 0.03981013627315878,
"grad_norm": 11.247802734375,
"learning_rate": 4.900474659317103e-06,
"loss": 0.6138,
"step": 260
},
{
"epoch": 0.03981013627315878,
"eval_accuracy": 0.5918217597225002,
"eval_loss": 0.7174475789070129,
"eval_runtime": 279.259,
"eval_samples_per_second": 161.671,
"eval_steps_per_second": 20.211,
"step": 260
},
{
"epoch": 0.04134129536058796,
"grad_norm": 14.550101280212402,
"learning_rate": 4.89664676159853e-06,
"loss": 0.7057,
"step": 270
},
{
"epoch": 0.04134129536058796,
"eval_accuracy": 0.5835486457590877,
"eval_loss": 0.7064123749732971,
"eval_runtime": 279.3626,
"eval_samples_per_second": 161.611,
"eval_steps_per_second": 20.203,
"step": 270
},
{
"epoch": 0.04287245444801715,
"grad_norm": 10.610974311828613,
"learning_rate": 4.8928188638799574e-06,
"loss": 0.6402,
"step": 280
},
{
"epoch": 0.04287245444801715,
"eval_accuracy": 0.565008347245409,
"eval_loss": 0.7342826128005981,
"eval_runtime": 276.5673,
"eval_samples_per_second": 163.244,
"eval_steps_per_second": 20.407,
"step": 280
},
{
"epoch": 0.044403613535446335,
"grad_norm": 16.33481216430664,
"learning_rate": 4.888990966161384e-06,
"loss": 0.6989,
"step": 290
},
{
"epoch": 0.044403613535446335,
"eval_accuracy": 0.5488992095744206,
"eval_loss": 0.7351524829864502,
"eval_runtime": 277.0818,
"eval_samples_per_second": 162.941,
"eval_steps_per_second": 20.369,
"step": 290
},
{
"epoch": 0.045934772622875514,
"grad_norm": 7.8038530349731445,
"learning_rate": 4.8851630684428116e-06,
"loss": 0.8763,
"step": 300
},
{
"epoch": 0.045934772622875514,
"eval_accuracy": 0.5477137400292102,
"eval_loss": 0.7000990509986877,
"eval_runtime": 276.7186,
"eval_samples_per_second": 163.155,
"eval_steps_per_second": 20.396,
"step": 300
},
{
"epoch": 0.0474659317103047,
"grad_norm": 7.672823429107666,
"learning_rate": 4.881335170724238e-06,
"loss": 0.6821,
"step": 310
},
{
"epoch": 0.0474659317103047,
"eval_accuracy": 0.5508064880495188,
"eval_loss": 0.6869771480560303,
"eval_runtime": 276.0483,
"eval_samples_per_second": 163.551,
"eval_steps_per_second": 20.446,
"step": 310
},
{
"epoch": 0.048997090797733886,
"grad_norm": 6.58916711807251,
"learning_rate": 4.877507273005666e-06,
"loss": 0.6771,
"step": 320
},
{
"epoch": 0.048997090797733886,
"eval_accuracy": 0.5899522937439237,
"eval_loss": 0.6720254421234131,
"eval_runtime": 276.7493,
"eval_samples_per_second": 163.137,
"eval_steps_per_second": 20.394,
"step": 320
},
{
"epoch": 0.050528249885163065,
"grad_norm": 5.778668403625488,
"learning_rate": 4.873679375287092e-06,
"loss": 0.6747,
"step": 330
},
{
"epoch": 0.050528249885163065,
"eval_accuracy": 0.59793675409615,
"eval_loss": 0.662046492099762,
"eval_runtime": 275.7574,
"eval_samples_per_second": 163.724,
"eval_steps_per_second": 20.467,
"step": 330
},
{
"epoch": 0.05205940897259225,
"grad_norm": 6.639097690582275,
"learning_rate": 4.86985147756852e-06,
"loss": 0.6283,
"step": 340
},
{
"epoch": 0.05205940897259225,
"eval_accuracy": 0.5966143497757848,
"eval_loss": 0.6647851467132568,
"eval_runtime": 274.2604,
"eval_samples_per_second": 164.617,
"eval_steps_per_second": 20.579,
"step": 340
},
{
"epoch": 0.05359056806002144,
"grad_norm": 7.865772724151611,
"learning_rate": 4.8660235798499465e-06,
"loss": 0.6396,
"step": 350
},
{
"epoch": 0.05359056806002144,
"eval_accuracy": 0.5950059134626113,
"eval_loss": 0.6750874519348145,
"eval_runtime": 275.118,
"eval_samples_per_second": 164.104,
"eval_steps_per_second": 20.515,
"step": 350
},
{
"epoch": 0.05512172714745062,
"grad_norm": 12.208525657653809,
"learning_rate": 4.862195682131374e-06,
"loss": 0.6802,
"step": 360
},
{
"epoch": 0.05512172714745062,
"eval_accuracy": 0.5931904836228232,
"eval_loss": 0.6721886992454529,
"eval_runtime": 276.9399,
"eval_samples_per_second": 163.025,
"eval_steps_per_second": 20.38,
"step": 360
},
{
"epoch": 0.0566528862348798,
"grad_norm": 21.355411529541016,
"learning_rate": 4.858367784412801e-06,
"loss": 0.7074,
"step": 370
},
{
"epoch": 0.0566528862348798,
"eval_accuracy": 0.5865997770345597,
"eval_loss": 0.6712462306022644,
"eval_runtime": 277.2497,
"eval_samples_per_second": 162.842,
"eval_steps_per_second": 20.357,
"step": 370
},
{
"epoch": 0.05818404532230899,
"grad_norm": 12.31411075592041,
"learning_rate": 4.854539886694228e-06,
"loss": 0.7007,
"step": 380
},
{
"epoch": 0.05818404532230899,
"eval_accuracy": 0.5599766601584416,
"eval_loss": 0.6877785921096802,
"eval_runtime": 278.0283,
"eval_samples_per_second": 162.386,
"eval_steps_per_second": 20.3,
"step": 380
},
{
"epoch": 0.059715204409738175,
"grad_norm": 8.385506629943848,
"learning_rate": 4.850711988975655e-06,
"loss": 0.6589,
"step": 390
},
{
"epoch": 0.059715204409738175,
"eval_accuracy": 0.5800094486063305,
"eval_loss": 0.6738844513893127,
"eval_runtime": 279.2165,
"eval_samples_per_second": 161.695,
"eval_steps_per_second": 20.214,
"step": 390
},
{
"epoch": 0.061246363497167354,
"grad_norm": 9.52385139465332,
"learning_rate": 4.846884091257082e-06,
"loss": 0.6144,
"step": 400
},
{
"epoch": 0.061246363497167354,
"eval_accuracy": 0.5881171772160372,
"eval_loss": 0.6737349033355713,
"eval_runtime": 277.7475,
"eval_samples_per_second": 162.551,
"eval_steps_per_second": 20.321,
"step": 400
},
{
"epoch": 0.06277752258459654,
"grad_norm": 9.535078048706055,
"learning_rate": 4.843056193538509e-06,
"loss": 0.6653,
"step": 410
},
{
"epoch": 0.06277752258459654,
"eval_accuracy": 0.5889365121885882,
"eval_loss": 0.6895773410797119,
"eval_runtime": 277.0672,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 20.371,
"step": 410
},
{
"epoch": 0.06430868167202572,
"grad_norm": 13.276960372924805,
"learning_rate": 4.839228295819936e-06,
"loss": 0.6033,
"step": 420
},
{
"epoch": 0.06430868167202572,
"eval_accuracy": 0.5839217088211637,
"eval_loss": 0.7118301391601562,
"eval_runtime": 277.1875,
"eval_samples_per_second": 162.879,
"eval_steps_per_second": 20.362,
"step": 420
},
{
"epoch": 0.06583984075945491,
"grad_norm": 11.221186637878418,
"learning_rate": 4.835400398101363e-06,
"loss": 0.6102,
"step": 430
},
{
"epoch": 0.06583984075945491,
"eval_accuracy": 0.5721997903049502,
"eval_loss": 0.7291567325592041,
"eval_runtime": 277.8978,
"eval_samples_per_second": 162.463,
"eval_steps_per_second": 20.31,
"step": 430
},
{
"epoch": 0.06737099984688409,
"grad_norm": 12.54729175567627,
"learning_rate": 4.8315725003827905e-06,
"loss": 0.7269,
"step": 440
},
{
"epoch": 0.06737099984688409,
"eval_accuracy": 0.5739306990338918,
"eval_loss": 0.7139677405357361,
"eval_runtime": 278.0873,
"eval_samples_per_second": 162.352,
"eval_steps_per_second": 20.296,
"step": 440
},
{
"epoch": 0.06890215893431327,
"grad_norm": 7.917787075042725,
"learning_rate": 4.827744602664217e-06,
"loss": 0.625,
"step": 450
},
{
"epoch": 0.06890215893431327,
"eval_accuracy": 0.5735848215281029,
"eval_loss": 0.7001749873161316,
"eval_runtime": 274.8459,
"eval_samples_per_second": 164.267,
"eval_steps_per_second": 20.535,
"step": 450
},
{
"epoch": 0.07043331802174246,
"grad_norm": 9.970056533813477,
"learning_rate": 4.823916704945645e-06,
"loss": 0.6168,
"step": 460
},
{
"epoch": 0.07043331802174246,
"eval_accuracy": 0.5706444127097465,
"eval_loss": 0.7068008184432983,
"eval_runtime": 275.5547,
"eval_samples_per_second": 163.844,
"eval_steps_per_second": 20.482,
"step": 460
},
{
"epoch": 0.07196447710917164,
"grad_norm": 14.022720336914062,
"learning_rate": 4.820088807227071e-06,
"loss": 0.5978,
"step": 470
},
{
"epoch": 0.07196447710917164,
"eval_accuracy": 0.5733915328597199,
"eval_loss": 0.7220426797866821,
"eval_runtime": 276.0799,
"eval_samples_per_second": 163.532,
"eval_steps_per_second": 20.443,
"step": 470
},
{
"epoch": 0.07349563619660082,
"grad_norm": 15.758445739746094,
"learning_rate": 4.816260909508498e-06,
"loss": 0.6583,
"step": 480
},
{
"epoch": 0.07349563619660082,
"eval_accuracy": 0.5795259437643544,
"eval_loss": 0.7102298736572266,
"eval_runtime": 277.2435,
"eval_samples_per_second": 162.846,
"eval_steps_per_second": 20.358,
"step": 480
},
{
"epoch": 0.07502679528403002,
"grad_norm": 9.483732223510742,
"learning_rate": 4.8124330117899254e-06,
"loss": 0.6455,
"step": 490
},
{
"epoch": 0.07502679528403002,
"eval_accuracy": 0.5872728491919802,
"eval_loss": 0.6983802318572998,
"eval_runtime": 277.7989,
"eval_samples_per_second": 162.52,
"eval_steps_per_second": 20.317,
"step": 490
},
{
"epoch": 0.0765579543714592,
"grad_norm": 11.97518539428711,
"learning_rate": 4.808605114071352e-06,
"loss": 0.6796,
"step": 500
},
{
"epoch": 0.0765579543714592,
"eval_accuracy": 0.5854998659876709,
"eval_loss": 0.7021452188491821,
"eval_runtime": 278.7441,
"eval_samples_per_second": 161.969,
"eval_steps_per_second": 20.248,
"step": 500
},
{
"epoch": 0.07808911345888837,
"grad_norm": 10.056512832641602,
"learning_rate": 4.8047772163527796e-06,
"loss": 0.6509,
"step": 510
},
{
"epoch": 0.07808911345888837,
"eval_accuracy": 0.57778125558934,
"eval_loss": 0.7093414664268494,
"eval_runtime": 286.7453,
"eval_samples_per_second": 157.45,
"eval_steps_per_second": 19.683,
"step": 510
},
{
"epoch": 0.07962027254631757,
"grad_norm": 11.229554176330566,
"learning_rate": 4.800949318634206e-06,
"loss": 0.5777,
"step": 520
},
{
"epoch": 0.07962027254631757,
"eval_accuracy": 0.5840340820377846,
"eval_loss": 0.7045831084251404,
"eval_runtime": 284.2635,
"eval_samples_per_second": 158.824,
"eval_steps_per_second": 19.855,
"step": 520
},
{
"epoch": 0.08115143163374675,
"grad_norm": 16.119789123535156,
"learning_rate": 4.797121420915634e-06,
"loss": 0.6145,
"step": 530
},
{
"epoch": 0.08115143163374675,
"eval_accuracy": 0.5854364178573018,
"eval_loss": 0.7088597416877747,
"eval_runtime": 279.337,
"eval_samples_per_second": 161.626,
"eval_steps_per_second": 20.205,
"step": 530
},
{
"epoch": 0.08268259072117592,
"grad_norm": 14.363024711608887,
"learning_rate": 4.79329352319706e-06,
"loss": 0.6973,
"step": 540
},
{
"epoch": 0.08268259072117592,
"eval_accuracy": 0.5904418635696169,
"eval_loss": 0.6892778277397156,
"eval_runtime": 279.0651,
"eval_samples_per_second": 161.783,
"eval_steps_per_second": 20.225,
"step": 540
},
{
"epoch": 0.08421374980860512,
"grad_norm": 12.421643257141113,
"learning_rate": 4.789465625478488e-06,
"loss": 0.6444,
"step": 550
},
{
"epoch": 0.08421374980860512,
"eval_accuracy": 0.5964966878584449,
"eval_loss": 0.6805678009986877,
"eval_runtime": 277.6804,
"eval_samples_per_second": 162.59,
"eval_steps_per_second": 20.326,
"step": 550
},
{
"epoch": 0.0857449088960343,
"grad_norm": 13.641290664672852,
"learning_rate": 4.7856377277599145e-06,
"loss": 0.6197,
"step": 560
},
{
"epoch": 0.0857449088960343,
"eval_accuracy": 0.6008450077829665,
"eval_loss": 0.6836313605308533,
"eval_runtime": 278.3481,
"eval_samples_per_second": 162.2,
"eval_steps_per_second": 20.277,
"step": 560
},
{
"epoch": 0.08727606798346348,
"grad_norm": 11.80357837677002,
"learning_rate": 4.781809830041342e-06,
"loss": 0.6241,
"step": 570
},
{
"epoch": 0.08727606798346348,
"eval_accuracy": 0.601209668453003,
"eval_loss": 0.6760628819465637,
"eval_runtime": 279.0499,
"eval_samples_per_second": 161.792,
"eval_steps_per_second": 20.226,
"step": 570
},
{
"epoch": 0.08880722707089267,
"grad_norm": 10.733393669128418,
"learning_rate": 4.777981932322769e-06,
"loss": 0.713,
"step": 580
},
{
"epoch": 0.08880722707089267,
"eval_accuracy": 0.5967536955697755,
"eval_loss": 0.6692460775375366,
"eval_runtime": 279.8616,
"eval_samples_per_second": 161.323,
"eval_steps_per_second": 20.167,
"step": 580
},
{
"epoch": 0.09033838615832185,
"grad_norm": 11.116392135620117,
"learning_rate": 4.774154034604196e-06,
"loss": 0.6109,
"step": 590
},
{
"epoch": 0.09033838615832185,
"eval_accuracy": 0.5920902946621758,
"eval_loss": 0.674372673034668,
"eval_runtime": 276.8783,
"eval_samples_per_second": 163.061,
"eval_steps_per_second": 20.384,
"step": 590
},
{
"epoch": 0.09186954524575103,
"grad_norm": 9.64384937286377,
"learning_rate": 4.770326136885623e-06,
"loss": 0.6704,
"step": 600
},
{
"epoch": 0.09186954524575103,
"eval_accuracy": 0.586066763425254,
"eval_loss": 0.6863875389099121,
"eval_runtime": 279.7067,
"eval_samples_per_second": 161.412,
"eval_steps_per_second": 20.178,
"step": 600
},
{
"epoch": 0.09340070433318022,
"grad_norm": 13.213354110717773,
"learning_rate": 4.76649823916705e-06,
"loss": 0.6605,
"step": 610
},
{
"epoch": 0.09340070433318022,
"eval_accuracy": 0.5854859919317092,
"eval_loss": 0.6997817158699036,
"eval_runtime": 278.2489,
"eval_samples_per_second": 162.258,
"eval_steps_per_second": 20.284,
"step": 610
},
{
"epoch": 0.0949318634206094,
"grad_norm": 8.854043006896973,
"learning_rate": 4.762670341448477e-06,
"loss": 0.6467,
"step": 620
},
{
"epoch": 0.0949318634206094,
"eval_accuracy": 0.5874072750022343,
"eval_loss": 0.6911128759384155,
"eval_runtime": 279.0402,
"eval_samples_per_second": 161.797,
"eval_steps_per_second": 20.226,
"step": 620
},
{
"epoch": 0.09646302250803858,
"grad_norm": 8.668028831481934,
"learning_rate": 4.758842443729904e-06,
"loss": 0.653,
"step": 630
},
{
"epoch": 0.09646302250803858,
"eval_accuracy": 0.5896617883276816,
"eval_loss": 0.683178186416626,
"eval_runtime": 278.9227,
"eval_samples_per_second": 161.866,
"eval_steps_per_second": 20.235,
"step": 630
},
{
"epoch": 0.09799418159546777,
"grad_norm": 8.654230117797852,
"learning_rate": 4.755014546011331e-06,
"loss": 0.6292,
"step": 640
},
{
"epoch": 0.09799418159546777,
"eval_accuracy": 0.5951244535641003,
"eval_loss": 0.6820477843284607,
"eval_runtime": 279.422,
"eval_samples_per_second": 161.576,
"eval_steps_per_second": 20.199,
"step": 640
},
{
"epoch": 0.09952534068289695,
"grad_norm": 12.051166534423828,
"learning_rate": 4.7511866482927585e-06,
"loss": 0.6319,
"step": 650
},
{
"epoch": 0.09952534068289695,
"eval_accuracy": 0.5965489637996976,
"eval_loss": 0.6887350678443909,
"eval_runtime": 279.2633,
"eval_samples_per_second": 161.668,
"eval_steps_per_second": 20.21,
"step": 650
},
{
"epoch": 0.10105649977032613,
"grad_norm": 10.261021614074707,
"learning_rate": 4.747358750574185e-06,
"loss": 0.687,
"step": 660
},
{
"epoch": 0.10105649977032613,
"eval_accuracy": 0.5923518675154699,
"eval_loss": 0.6835098266601562,
"eval_runtime": 279.8292,
"eval_samples_per_second": 161.341,
"eval_steps_per_second": 20.169,
"step": 660
},
{
"epoch": 0.10258765885775532,
"grad_norm": 9.035223007202148,
"learning_rate": 4.743530852855613e-06,
"loss": 0.6705,
"step": 670
},
{
"epoch": 0.10258765885775532,
"eval_accuracy": 0.5858986422906305,
"eval_loss": 0.6886019706726074,
"eval_runtime": 279.2751,
"eval_samples_per_second": 161.661,
"eval_steps_per_second": 20.209,
"step": 670
},
{
"epoch": 0.1041188179451845,
"grad_norm": 7.328871726989746,
"learning_rate": 4.739702955137039e-06,
"loss": 0.565,
"step": 680
},
{
"epoch": 0.1041188179451845,
"eval_accuracy": 0.5869302949061662,
"eval_loss": 0.7063195109367371,
"eval_runtime": 279.7163,
"eval_samples_per_second": 161.406,
"eval_steps_per_second": 20.178,
"step": 680
},
{
"epoch": 0.10564997703261368,
"grad_norm": 21.045848846435547,
"learning_rate": 4.735875057418467e-06,
"loss": 0.6541,
"step": 690
},
{
"epoch": 0.10564997703261368,
"eval_accuracy": 0.5889380826306538,
"eval_loss": 0.7606213092803955,
"eval_runtime": 280.23,
"eval_samples_per_second": 161.111,
"eval_steps_per_second": 20.141,
"step": 690
},
{
"epoch": 0.10718113612004287,
"grad_norm": 12.267477989196777,
"learning_rate": 4.732047159699893e-06,
"loss": 0.7604,
"step": 700
},
{
"epoch": 0.10718113612004287,
"eval_accuracy": 0.5878412959789937,
"eval_loss": 0.7354863882064819,
"eval_runtime": 280.1519,
"eval_samples_per_second": 161.155,
"eval_steps_per_second": 20.146,
"step": 700
},
{
"epoch": 0.10871229520747205,
"grad_norm": 8.619697570800781,
"learning_rate": 4.72821926198132e-06,
"loss": 0.6401,
"step": 710
},
{
"epoch": 0.10871229520747205,
"eval_accuracy": 0.5879179670084708,
"eval_loss": 0.699480414390564,
"eval_runtime": 282.5216,
"eval_samples_per_second": 159.804,
"eval_steps_per_second": 19.977,
"step": 710
},
{
"epoch": 0.11024345429490125,
"grad_norm": 6.322849750518799,
"learning_rate": 4.724391364262747e-06,
"loss": 0.6129,
"step": 720
},
{
"epoch": 0.11024345429490125,
"eval_accuracy": 0.5929760364139408,
"eval_loss": 0.688232958316803,
"eval_runtime": 278.9404,
"eval_samples_per_second": 161.855,
"eval_steps_per_second": 20.234,
"step": 720
},
{
"epoch": 0.11177461338233043,
"grad_norm": 9.213967323303223,
"learning_rate": 4.720563466544174e-06,
"loss": 0.6502,
"step": 730
},
{
"epoch": 0.11177461338233043,
"eval_accuracy": 0.5926983206583555,
"eval_loss": 0.6913579702377319,
"eval_runtime": 277.6463,
"eval_samples_per_second": 162.61,
"eval_steps_per_second": 20.328,
"step": 730
},
{
"epoch": 0.1133057724697596,
"grad_norm": 7.4615349769592285,
"learning_rate": 4.716735568825601e-06,
"loss": 0.6199,
"step": 740
},
{
"epoch": 0.1133057724697596,
"eval_accuracy": 0.5917493589028877,
"eval_loss": 0.6992406845092773,
"eval_runtime": 278.7175,
"eval_samples_per_second": 161.985,
"eval_steps_per_second": 20.25,
"step": 740
},
{
"epoch": 0.1148369315571888,
"grad_norm": 11.835037231445312,
"learning_rate": 4.712907671107028e-06,
"loss": 0.5761,
"step": 750
},
{
"epoch": 0.1148369315571888,
"eval_accuracy": 0.5893469260561813,
"eval_loss": 0.7284606099128723,
"eval_runtime": 279.7689,
"eval_samples_per_second": 161.376,
"eval_steps_per_second": 20.174,
"step": 750
},
{
"epoch": 0.11636809064461798,
"grad_norm": 9.900166511535645,
"learning_rate": 4.709079773388455e-06,
"loss": 0.6017,
"step": 760
},
{
"epoch": 0.11636809064461798,
"eval_accuracy": 0.5889316629208483,
"eval_loss": 0.7434907555580139,
"eval_runtime": 281.7435,
"eval_samples_per_second": 160.245,
"eval_steps_per_second": 20.032,
"step": 760
},
{
"epoch": 0.11789924973204716,
"grad_norm": 12.274435997009277,
"learning_rate": 4.7052518756698825e-06,
"loss": 0.5757,
"step": 770
},
{
"epoch": 0.11789924973204716,
"eval_accuracy": 0.5897960545337277,
"eval_loss": 0.7581047415733337,
"eval_runtime": 279.4034,
"eval_samples_per_second": 161.587,
"eval_steps_per_second": 20.2,
"step": 770
},
{
"epoch": 0.11943040881947635,
"grad_norm": 10.7369384765625,
"learning_rate": 4.701423977951309e-06,
"loss": 0.6231,
"step": 780
},
{
"epoch": 0.11943040881947635,
"eval_accuracy": 0.5952973044984236,
"eval_loss": 0.7496009469032288,
"eval_runtime": 279.5031,
"eval_samples_per_second": 161.529,
"eval_steps_per_second": 20.193,
"step": 780
},
{
"epoch": 0.12096156790690553,
"grad_norm": 11.4940824508667,
"learning_rate": 4.697596080232737e-06,
"loss": 0.6995,
"step": 790
},
{
"epoch": 0.12096156790690553,
"eval_accuracy": 0.5959564541213064,
"eval_loss": 0.7335057258605957,
"eval_runtime": 279.459,
"eval_samples_per_second": 161.555,
"eval_steps_per_second": 20.196,
"step": 790
},
{
"epoch": 0.12249272699433471,
"grad_norm": 8.03961181640625,
"learning_rate": 4.693768182514163e-06,
"loss": 0.6434,
"step": 800
},
{
"epoch": 0.12249272699433471,
"eval_accuracy": 0.5859463796215819,
"eval_loss": 0.728286623954773,
"eval_runtime": 280.7002,
"eval_samples_per_second": 160.841,
"eval_steps_per_second": 20.107,
"step": 800
},
{
"epoch": 0.1240238860817639,
"grad_norm": 8.879143714904785,
"learning_rate": 4.689940284795591e-06,
"loss": 0.7005,
"step": 810
},
{
"epoch": 0.1240238860817639,
"eval_accuracy": 0.5771607003457121,
"eval_loss": 0.7147245407104492,
"eval_runtime": 276.8558,
"eval_samples_per_second": 163.074,
"eval_steps_per_second": 20.386,
"step": 810
},
{
"epoch": 0.12555504516919308,
"grad_norm": 7.983788967132568,
"learning_rate": 4.686112387077017e-06,
"loss": 0.6639,
"step": 820
},
{
"epoch": 0.12555504516919308,
"eval_accuracy": 0.5777222309014216,
"eval_loss": 0.6992844939231873,
"eval_runtime": 278.3704,
"eval_samples_per_second": 162.187,
"eval_steps_per_second": 20.275,
"step": 820
},
{
"epoch": 0.12708620425662226,
"grad_norm": 6.068845748901367,
"learning_rate": 4.682284489358445e-06,
"loss": 0.6211,
"step": 830
},
{
"epoch": 0.12708620425662226,
"eval_accuracy": 0.5828836462560764,
"eval_loss": 0.689354419708252,
"eval_runtime": 278.7274,
"eval_samples_per_second": 161.979,
"eval_steps_per_second": 20.249,
"step": 830
},
{
"epoch": 0.12861736334405144,
"grad_norm": 7.345738887786865,
"learning_rate": 4.6784565916398715e-06,
"loss": 0.6456,
"step": 840
},
{
"epoch": 0.12861736334405144,
"eval_accuracy": 0.5853175045103236,
"eval_loss": 0.6859722137451172,
"eval_runtime": 279.9849,
"eval_samples_per_second": 161.252,
"eval_steps_per_second": 20.158,
"step": 840
},
{
"epoch": 0.13014852243148062,
"grad_norm": 11.570878028869629,
"learning_rate": 4.674628693921299e-06,
"loss": 0.6255,
"step": 850
},
{
"epoch": 0.13014852243148062,
"eval_accuracy": 0.5951967978652435,
"eval_loss": 0.6828535199165344,
"eval_runtime": 281.2947,
"eval_samples_per_second": 160.501,
"eval_steps_per_second": 20.064,
"step": 850
},
{
"epoch": 0.13167968151890982,
"grad_norm": 10.029556274414062,
"learning_rate": 4.670800796202726e-06,
"loss": 0.5931,
"step": 860
},
{
"epoch": 0.13167968151890982,
"eval_accuracy": 0.5980770938804512,
"eval_loss": 0.6795242428779602,
"eval_runtime": 280.5479,
"eval_samples_per_second": 160.928,
"eval_steps_per_second": 20.118,
"step": 860
},
{
"epoch": 0.133210840606339,
"grad_norm": 16.333703994750977,
"learning_rate": 4.666972898484153e-06,
"loss": 0.7352,
"step": 870
},
{
"epoch": 0.133210840606339,
"eval_accuracy": 0.5991164979577339,
"eval_loss": 0.6769992709159851,
"eval_runtime": 280.0752,
"eval_samples_per_second": 161.2,
"eval_steps_per_second": 20.152,
"step": 870
},
{
"epoch": 0.13474199969376818,
"grad_norm": 6.884426593780518,
"learning_rate": 4.66314500076558e-06,
"loss": 0.6425,
"step": 880
},
{
"epoch": 0.13474199969376818,
"eval_accuracy": 0.598591236334548,
"eval_loss": 0.6669920086860657,
"eval_runtime": 279.9453,
"eval_samples_per_second": 161.274,
"eval_steps_per_second": 20.161,
"step": 880
},
{
"epoch": 0.13627315878119736,
"grad_norm": 9.960312843322754,
"learning_rate": 4.659317103047007e-06,
"loss": 0.6905,
"step": 890
},
{
"epoch": 0.13627315878119736,
"eval_accuracy": 0.5981738203145828,
"eval_loss": 0.6663030385971069,
"eval_runtime": 279.6741,
"eval_samples_per_second": 161.431,
"eval_steps_per_second": 20.181,
"step": 890
},
{
"epoch": 0.13780431786862654,
"grad_norm": 7.798278331756592,
"learning_rate": 4.655489205328434e-06,
"loss": 0.5681,
"step": 900
},
{
"epoch": 0.13780431786862654,
"eval_accuracy": 0.6002266515565629,
"eval_loss": 0.6640587449073792,
"eval_runtime": 279.9048,
"eval_samples_per_second": 161.298,
"eval_steps_per_second": 20.164,
"step": 900
},
{
"epoch": 0.13933547695605572,
"grad_norm": 10.989811897277832,
"learning_rate": 4.651661307609861e-06,
"loss": 0.5809,
"step": 910
},
{
"epoch": 0.13933547695605572,
"eval_accuracy": 0.599174562318326,
"eval_loss": 0.6831759810447693,
"eval_runtime": 278.8518,
"eval_samples_per_second": 161.907,
"eval_steps_per_second": 20.24,
"step": 910
},
{
"epoch": 0.14086663604348493,
"grad_norm": 13.403684616088867,
"learning_rate": 4.647833409891288e-06,
"loss": 0.5984,
"step": 920
},
{
"epoch": 0.14086663604348493,
"eval_accuracy": 0.5973414996782282,
"eval_loss": 0.7047386765480042,
"eval_runtime": 277.7644,
"eval_samples_per_second": 162.541,
"eval_steps_per_second": 20.319,
"step": 920
},
{
"epoch": 0.1423977951309141,
"grad_norm": 11.702314376831055,
"learning_rate": 4.6440055121727155e-06,
"loss": 0.631,
"step": 930
},
{
"epoch": 0.1423977951309141,
"eval_accuracy": 0.5949097681018046,
"eval_loss": 0.7055184841156006,
"eval_runtime": 277.5329,
"eval_samples_per_second": 162.676,
"eval_steps_per_second": 20.336,
"step": 930
},
{
"epoch": 0.14392895421834329,
"grad_norm": 10.92357063293457,
"learning_rate": 4.640177614454142e-06,
"loss": 0.6703,
"step": 940
},
{
"epoch": 0.14392895421834329,
"eval_accuracy": 0.5971907868459593,
"eval_loss": 0.684637188911438,
"eval_runtime": 276.7646,
"eval_samples_per_second": 163.128,
"eval_steps_per_second": 20.393,
"step": 940
},
{
"epoch": 0.14546011330577246,
"grad_norm": 9.321954727172852,
"learning_rate": 4.636349716735569e-06,
"loss": 0.6304,
"step": 950
},
{
"epoch": 0.14546011330577246,
"eval_accuracy": 0.5969360568383659,
"eval_loss": 0.6831667423248291,
"eval_runtime": 276.3791,
"eval_samples_per_second": 163.355,
"eval_steps_per_second": 20.421,
"step": 950
},
{
"epoch": 0.14699127239320164,
"grad_norm": 11.221244812011719,
"learning_rate": 4.632521819016996e-06,
"loss": 0.6373,
"step": 960
},
{
"epoch": 0.14699127239320164,
"eval_accuracy": 0.5956472445145944,
"eval_loss": 0.6867417097091675,
"eval_runtime": 275.868,
"eval_samples_per_second": 163.658,
"eval_steps_per_second": 20.459,
"step": 960
},
{
"epoch": 0.14852243148063085,
"grad_norm": 13.402386665344238,
"learning_rate": 4.628693921298423e-06,
"loss": 0.6338,
"step": 970
},
{
"epoch": 0.14852243148063085,
"eval_accuracy": 0.595343976519767,
"eval_loss": 0.6871860027313232,
"eval_runtime": 276.6694,
"eval_samples_per_second": 163.184,
"eval_steps_per_second": 20.4,
"step": 970
},
{
"epoch": 0.15005359056806003,
"grad_norm": 6.8687520027160645,
"learning_rate": 4.6248660235798504e-06,
"loss": 0.6541,
"step": 980
},
{
"epoch": 0.15005359056806003,
"eval_accuracy": 0.5944526067405725,
"eval_loss": 0.6828967332839966,
"eval_runtime": 277.3111,
"eval_samples_per_second": 162.806,
"eval_steps_per_second": 20.353,
"step": 980
},
{
"epoch": 0.1515847496554892,
"grad_norm": 8.383277893066406,
"learning_rate": 4.621038125861277e-06,
"loss": 0.6485,
"step": 990
},
{
"epoch": 0.1515847496554892,
"eval_accuracy": 0.5881514159035716,
"eval_loss": 0.6898565292358398,
"eval_runtime": 278.3748,
"eval_samples_per_second": 162.184,
"eval_steps_per_second": 20.275,
"step": 990
},
{
"epoch": 0.1531159087429184,
"grad_norm": 8.281054496765137,
"learning_rate": 4.617210228142705e-06,
"loss": 0.5877,
"step": 1000
},
{
"epoch": 0.1531159087429184,
"eval_accuracy": 0.5914452307829261,
"eval_loss": 0.6997293829917908,
"eval_runtime": 277.7962,
"eval_samples_per_second": 162.522,
"eval_steps_per_second": 20.317,
"step": 1000
},
{
"epoch": 0.15464706783034757,
"grad_norm": 10.8377685546875,
"learning_rate": 4.613382330424131e-06,
"loss": 0.6585,
"step": 1010
},
{
"epoch": 0.15464706783034757,
"eval_accuracy": 0.5923300819872465,
"eval_loss": 0.6947582364082336,
"eval_runtime": 278.9872,
"eval_samples_per_second": 161.828,
"eval_steps_per_second": 20.23,
"step": 1010
},
{
"epoch": 0.15617822691777675,
"grad_norm": 12.618541717529297,
"learning_rate": 4.609554432705559e-06,
"loss": 0.6153,
"step": 1020
},
{
"epoch": 0.15617822691777675,
"eval_accuracy": 0.5965337184757249,
"eval_loss": 0.6904256939888,
"eval_runtime": 278.6853,
"eval_samples_per_second": 162.004,
"eval_steps_per_second": 20.252,
"step": 1020
},
{
"epoch": 0.15770938600520595,
"grad_norm": 15.610793113708496,
"learning_rate": 4.605726534986985e-06,
"loss": 0.6145,
"step": 1030
},
{
"epoch": 0.15770938600520595,
"eval_accuracy": 0.5957805907172996,
"eval_loss": 0.7072130441665649,
"eval_runtime": 278.4706,
"eval_samples_per_second": 162.128,
"eval_steps_per_second": 20.268,
"step": 1030
},
{
"epoch": 0.15924054509263513,
"grad_norm": 10.127962112426758,
"learning_rate": 4.601898637268413e-06,
"loss": 0.6019,
"step": 1040
},
{
"epoch": 0.15924054509263513,
"eval_accuracy": 0.5954627183733269,
"eval_loss": 0.6940288543701172,
"eval_runtime": 278.8001,
"eval_samples_per_second": 161.937,
"eval_steps_per_second": 20.244,
"step": 1040
},
{
"epoch": 0.1607717041800643,
"grad_norm": 18.335458755493164,
"learning_rate": 4.5980707395498395e-06,
"loss": 0.5354,
"step": 1050
},
{
"epoch": 0.1607717041800643,
"eval_accuracy": 0.5993520757982559,
"eval_loss": 0.7147676348686218,
"eval_runtime": 278.6112,
"eval_samples_per_second": 162.047,
"eval_steps_per_second": 20.258,
"step": 1050
},
{
"epoch": 0.1623028632674935,
"grad_norm": 13.370587348937988,
"learning_rate": 4.594242841831267e-06,
"loss": 0.6977,
"step": 1060
},
{
"epoch": 0.1623028632674935,
"eval_accuracy": 0.5989220600629908,
"eval_loss": 0.7047263979911804,
"eval_runtime": 279.4512,
"eval_samples_per_second": 161.56,
"eval_steps_per_second": 20.197,
"step": 1060
},
{
"epoch": 0.16383402235492267,
"grad_norm": 9.09716510772705,
"learning_rate": 4.590414944112694e-06,
"loss": 0.6039,
"step": 1070
},
{
"epoch": 0.16383402235492267,
"eval_accuracy": 0.5984739258700619,
"eval_loss": 0.6938444972038269,
"eval_runtime": 280.6964,
"eval_samples_per_second": 160.843,
"eval_steps_per_second": 20.107,
"step": 1070
},
{
"epoch": 0.16536518144235185,
"grad_norm": 11.401485443115234,
"learning_rate": 4.586587046394121e-06,
"loss": 0.6579,
"step": 1080
},
{
"epoch": 0.16536518144235185,
"eval_accuracy": 0.5967512870584059,
"eval_loss": 0.6896911263465881,
"eval_runtime": 279.247,
"eval_samples_per_second": 161.678,
"eval_steps_per_second": 20.212,
"step": 1080
},
{
"epoch": 0.16689634052978106,
"grad_norm": 10.442956924438477,
"learning_rate": 4.582759148675548e-06,
"loss": 0.5409,
"step": 1090
},
{
"epoch": 0.16689634052978106,
"eval_accuracy": 0.5923961292613636,
"eval_loss": 0.7205661535263062,
"eval_runtime": 278.5362,
"eval_samples_per_second": 162.09,
"eval_steps_per_second": 20.263,
"step": 1090
},
{
"epoch": 0.16842749961721024,
"grad_norm": 24.116500854492188,
"learning_rate": 4.578931250956975e-06,
"loss": 0.5717,
"step": 1100
},
{
"epoch": 0.16842749961721024,
"eval_accuracy": 0.5918675367336973,
"eval_loss": 0.7739020586013794,
"eval_runtime": 277.7642,
"eval_samples_per_second": 162.541,
"eval_steps_per_second": 20.319,
"step": 1100
},
{
"epoch": 0.16995865870463941,
"grad_norm": 17.19237518310547,
"learning_rate": 4.575103353238402e-06,
"loss": 0.7444,
"step": 1110
},
{
"epoch": 0.16995865870463941,
"eval_accuracy": 0.5971056953877569,
"eval_loss": 0.7259252667427063,
"eval_runtime": 280.0491,
"eval_samples_per_second": 161.215,
"eval_steps_per_second": 20.154,
"step": 1110
},
{
"epoch": 0.1714898177920686,
"grad_norm": 12.191926002502441,
"learning_rate": 4.571275455519829e-06,
"loss": 0.5495,
"step": 1120
},
{
"epoch": 0.1714898177920686,
"eval_accuracy": 0.5972413486739816,
"eval_loss": 0.7175703644752502,
"eval_runtime": 280.1557,
"eval_samples_per_second": 161.153,
"eval_steps_per_second": 20.146,
"step": 1120
},
{
"epoch": 0.17302097687949777,
"grad_norm": 18.153154373168945,
"learning_rate": 4.567447557801256e-06,
"loss": 0.6002,
"step": 1130
},
{
"epoch": 0.17302097687949777,
"eval_accuracy": 0.5982025962498613,
"eval_loss": 0.7397978901863098,
"eval_runtime": 280.0206,
"eval_samples_per_second": 161.231,
"eval_steps_per_second": 20.156,
"step": 1130
},
{
"epoch": 0.17455213596692695,
"grad_norm": 9.707260131835938,
"learning_rate": 4.5636196600826835e-06,
"loss": 0.648,
"step": 1140
},
{
"epoch": 0.17455213596692695,
"eval_accuracy": 0.5982077501497238,
"eval_loss": 0.7219535708427429,
"eval_runtime": 279.1283,
"eval_samples_per_second": 161.746,
"eval_steps_per_second": 20.22,
"step": 1140
},
{
"epoch": 0.17608329505435616,
"grad_norm": 13.713787078857422,
"learning_rate": 4.55979176236411e-06,
"loss": 0.7169,
"step": 1150
},
{
"epoch": 0.17608329505435616,
"eval_accuracy": 0.5967283703999645,
"eval_loss": 0.7080119848251343,
"eval_runtime": 280.0417,
"eval_samples_per_second": 161.219,
"eval_steps_per_second": 20.154,
"step": 1150
},
{
"epoch": 0.17761445414178534,
"grad_norm": 12.010796546936035,
"learning_rate": 4.555963864645538e-06,
"loss": 0.6007,
"step": 1160
},
{
"epoch": 0.17761445414178534,
"eval_accuracy": 0.593027131524565,
"eval_loss": 0.686759889125824,
"eval_runtime": 277.4036,
"eval_samples_per_second": 162.752,
"eval_steps_per_second": 20.346,
"step": 1160
},
{
"epoch": 0.17914561322921452,
"grad_norm": 11.684185028076172,
"learning_rate": 4.552135966926964e-06,
"loss": 0.5699,
"step": 1170
},
{
"epoch": 0.17914561322921452,
"eval_accuracy": 0.589081225033289,
"eval_loss": 0.6952749490737915,
"eval_runtime": 278.728,
"eval_samples_per_second": 161.979,
"eval_steps_per_second": 20.249,
"step": 1170
},
{
"epoch": 0.1806767723166437,
"grad_norm": 14.61754035949707,
"learning_rate": 4.548308069208391e-06,
"loss": 0.6718,
"step": 1180
},
{
"epoch": 0.1806767723166437,
"eval_accuracy": 0.5850660157550205,
"eval_loss": 0.7031010985374451,
"eval_runtime": 279.3758,
"eval_samples_per_second": 161.603,
"eval_steps_per_second": 20.202,
"step": 1180
},
{
"epoch": 0.18220793140407288,
"grad_norm": 8.807073593139648,
"learning_rate": 4.544480171489818e-06,
"loss": 0.6719,
"step": 1190
},
{
"epoch": 0.18220793140407288,
"eval_accuracy": 0.5842983840494343,
"eval_loss": 0.6897585988044739,
"eval_runtime": 278.4428,
"eval_samples_per_second": 162.145,
"eval_steps_per_second": 20.27,
"step": 1190
},
{
"epoch": 0.18373909049150206,
"grad_norm": 8.141523361206055,
"learning_rate": 4.540652273771245e-06,
"loss": 0.6139,
"step": 1200
},
{
"epoch": 0.18373909049150206,
"eval_accuracy": 0.5901683023224832,
"eval_loss": 0.6856178045272827,
"eval_runtime": 278.2363,
"eval_samples_per_second": 162.265,
"eval_steps_per_second": 20.285,
"step": 1200
},
{
"epoch": 0.18527024957893126,
"grad_norm": 8.22572135925293,
"learning_rate": 4.536824376052672e-06,
"loss": 0.6554,
"step": 1210
},
{
"epoch": 0.18527024957893126,
"eval_accuracy": 0.5936903334665423,
"eval_loss": 0.6899842619895935,
"eval_runtime": 279.6454,
"eval_samples_per_second": 161.447,
"eval_steps_per_second": 20.183,
"step": 1210
},
{
"epoch": 0.18680140866636044,
"grad_norm": 10.63383674621582,
"learning_rate": 4.532996478334099e-06,
"loss": 0.5281,
"step": 1220
},
{
"epoch": 0.18680140866636044,
"eval_accuracy": 0.5959264271926515,
"eval_loss": 0.7073134183883667,
"eval_runtime": 280.0671,
"eval_samples_per_second": 161.204,
"eval_steps_per_second": 20.152,
"step": 1220
},
{
"epoch": 0.18833256775378962,
"grad_norm": 17.710468292236328,
"learning_rate": 4.529168580615526e-06,
"loss": 0.6106,
"step": 1230
},
{
"epoch": 0.18833256775378962,
"eval_accuracy": 0.5961982077899033,
"eval_loss": 0.7480549812316895,
"eval_runtime": 279.3117,
"eval_samples_per_second": 161.64,
"eval_steps_per_second": 20.207,
"step": 1230
},
{
"epoch": 0.1898637268412188,
"grad_norm": 19.713132858276367,
"learning_rate": 4.525340682896953e-06,
"loss": 0.6344,
"step": 1240
},
{
"epoch": 0.1898637268412188,
"eval_accuracy": 0.5923722417084758,
"eval_loss": 0.7380005717277527,
"eval_runtime": 279.4642,
"eval_samples_per_second": 161.552,
"eval_steps_per_second": 20.196,
"step": 1240
},
{
"epoch": 0.19139488592864798,
"grad_norm": 17.92173957824707,
"learning_rate": 4.52151278517838e-06,
"loss": 0.5918,
"step": 1250
},
{
"epoch": 0.19139488592864798,
"eval_accuracy": 0.5899982226961699,
"eval_loss": 0.7242019772529602,
"eval_runtime": 279.4636,
"eval_samples_per_second": 161.552,
"eval_steps_per_second": 20.196,
"step": 1250
},
{
"epoch": 0.19292604501607716,
"grad_norm": 12.8328857421875,
"learning_rate": 4.5176848874598075e-06,
"loss": 0.6847,
"step": 1260
},
{
"epoch": 0.19292604501607716,
"eval_accuracy": 0.5958907152376721,
"eval_loss": 0.6952394843101501,
"eval_runtime": 276.2146,
"eval_samples_per_second": 163.453,
"eval_steps_per_second": 20.433,
"step": 1260
},
{
"epoch": 0.19445720410350636,
"grad_norm": 8.0042142868042,
"learning_rate": 4.513856989741234e-06,
"loss": 0.6312,
"step": 1270
},
{
"epoch": 0.19445720410350636,
"eval_accuracy": 0.5953926887841323,
"eval_loss": 0.6836423873901367,
"eval_runtime": 279.4491,
"eval_samples_per_second": 161.561,
"eval_steps_per_second": 20.197,
"step": 1270
},
{
"epoch": 0.19598836319093554,
"grad_norm": 10.186333656311035,
"learning_rate": 4.510029092022662e-06,
"loss": 0.6135,
"step": 1280
},
{
"epoch": 0.19598836319093554,
"eval_accuracy": 0.5948660962329148,
"eval_loss": 0.6878789067268372,
"eval_runtime": 276.736,
"eval_samples_per_second": 163.145,
"eval_steps_per_second": 20.395,
"step": 1280
},
{
"epoch": 0.19751952227836472,
"grad_norm": 9.086492538452148,
"learning_rate": 4.506201194304088e-06,
"loss": 0.6481,
"step": 1290
},
{
"epoch": 0.19751952227836472,
"eval_accuracy": 0.5944503735325507,
"eval_loss": 0.6821103692054749,
"eval_runtime": 274.4173,
"eval_samples_per_second": 164.523,
"eval_steps_per_second": 20.567,
"step": 1290
},
{
"epoch": 0.1990506813657939,
"grad_norm": 8.008011817932129,
"learning_rate": 4.502373296585516e-06,
"loss": 0.6022,
"step": 1300
},
{
"epoch": 0.1990506813657939,
"eval_accuracy": 0.5919674427913804,
"eval_loss": 0.6874357461929321,
"eval_runtime": 273.5543,
"eval_samples_per_second": 165.042,
"eval_steps_per_second": 20.632,
"step": 1300
},
{
"epoch": 0.20058184045322308,
"grad_norm": 9.115822792053223,
"learning_rate": 4.498545398866942e-06,
"loss": 0.5877,
"step": 1310
},
{
"epoch": 0.20058184045322308,
"eval_accuracy": 0.5949752993012595,
"eval_loss": 0.696998655796051,
"eval_runtime": 277.1667,
"eval_samples_per_second": 162.891,
"eval_steps_per_second": 20.363,
"step": 1310
},
{
"epoch": 0.20211299954065226,
"grad_norm": 14.700295448303223,
"learning_rate": 4.49471750114837e-06,
"loss": 0.6563,
"step": 1320
},
{
"epoch": 0.20211299954065226,
"eval_accuracy": 0.5916631504141775,
"eval_loss": 0.7209578156471252,
"eval_runtime": 277.3197,
"eval_samples_per_second": 162.801,
"eval_steps_per_second": 20.352,
"step": 1320
},
{
"epoch": 0.20364415862808147,
"grad_norm": 12.265641212463379,
"learning_rate": 4.4908896034297965e-06,
"loss": 0.6844,
"step": 1330
},
{
"epoch": 0.20364415862808147,
"eval_accuracy": 0.5921450151057401,
"eval_loss": 0.7018990516662598,
"eval_runtime": 275.9386,
"eval_samples_per_second": 163.616,
"eval_steps_per_second": 20.454,
"step": 1330
},
{
"epoch": 0.20517531771551065,
"grad_norm": 14.695290565490723,
"learning_rate": 4.487061705711224e-06,
"loss": 0.6242,
"step": 1340
},
{
"epoch": 0.20517531771551065,
"eval_accuracy": 0.5874938869870626,
"eval_loss": 0.7059697508811951,
"eval_runtime": 275.3787,
"eval_samples_per_second": 163.949,
"eval_steps_per_second": 20.495,
"step": 1340
},
{
"epoch": 0.20670647680293983,
"grad_norm": 17.197837829589844,
"learning_rate": 4.483233807992651e-06,
"loss": 0.6532,
"step": 1350
},
{
"epoch": 0.20670647680293983,
"eval_accuracy": 0.5832462130480237,
"eval_loss": 0.7054564952850342,
"eval_runtime": 278.9798,
"eval_samples_per_second": 161.833,
"eval_steps_per_second": 20.231,
"step": 1350
},
{
"epoch": 0.208237635890369,
"grad_norm": 10.455153465270996,
"learning_rate": 4.479405910274078e-06,
"loss": 0.6235,
"step": 1360
},
{
"epoch": 0.208237635890369,
"eval_accuracy": 0.5825216811207472,
"eval_loss": 0.7006902098655701,
"eval_runtime": 278.2863,
"eval_samples_per_second": 162.236,
"eval_steps_per_second": 20.281,
"step": 1360
},
{
"epoch": 0.20976879497779818,
"grad_norm": 11.930909156799316,
"learning_rate": 4.475578012555505e-06,
"loss": 0.5851,
"step": 1370
},
{
"epoch": 0.20976879497779818,
"eval_accuracy": 0.5853447126283504,
"eval_loss": 0.7129948139190674,
"eval_runtime": 277.2023,
"eval_samples_per_second": 162.87,
"eval_steps_per_second": 20.361,
"step": 1370
},
{
"epoch": 0.21129995406522736,
"grad_norm": 10.416621208190918,
"learning_rate": 4.471750114836932e-06,
"loss": 0.6387,
"step": 1380
},
{
"epoch": 0.21129995406522736,
"eval_accuracy": 0.5875080603917906,
"eval_loss": 0.7203475832939148,
"eval_runtime": 276.7485,
"eval_samples_per_second": 163.137,
"eval_steps_per_second": 20.394,
"step": 1380
},
{
"epoch": 0.21283111315265657,
"grad_norm": 14.316187858581543,
"learning_rate": 4.467922217118359e-06,
"loss": 0.5589,
"step": 1390
},
{
"epoch": 0.21283111315265657,
"eval_accuracy": 0.5918902114052229,
"eval_loss": 0.7276438474655151,
"eval_runtime": 277.5697,
"eval_samples_per_second": 162.655,
"eval_steps_per_second": 20.334,
"step": 1390
},
{
"epoch": 0.21436227224008575,
"grad_norm": 11.353260040283203,
"learning_rate": 4.4640943193997856e-06,
"loss": 0.5305,
"step": 1400
},
{
"epoch": 0.21436227224008575,
"eval_accuracy": 0.5941167335891918,
"eval_loss": 0.7376775145530701,
"eval_runtime": 277.7475,
"eval_samples_per_second": 162.551,
"eval_steps_per_second": 20.321,
"step": 1400
},
{
"epoch": 0.21589343132751493,
"grad_norm": 25.445398330688477,
"learning_rate": 4.460266421681213e-06,
"loss": 0.6585,
"step": 1410
},
{
"epoch": 0.21589343132751493,
"eval_accuracy": 0.5962537174308669,
"eval_loss": 0.7421597242355347,
"eval_runtime": 276.6197,
"eval_samples_per_second": 163.213,
"eval_steps_per_second": 20.403,
"step": 1410
},
{
"epoch": 0.2174245904149441,
"grad_norm": 12.295394897460938,
"learning_rate": 4.45643852396264e-06,
"loss": 0.6483,
"step": 1420
},
{
"epoch": 0.2174245904149441,
"eval_accuracy": 0.5987611837577426,
"eval_loss": 0.6953349709510803,
"eval_runtime": 278.2037,
"eval_samples_per_second": 162.284,
"eval_steps_per_second": 20.287,
"step": 1420
},
{
"epoch": 0.2189557495023733,
"grad_norm": 11.241786003112793,
"learning_rate": 4.452610626244067e-06,
"loss": 0.5395,
"step": 1430
},
{
"epoch": 0.2189557495023733,
"eval_accuracy": 0.5976324790121263,
"eval_loss": 0.6999543309211731,
"eval_runtime": 279.3688,
"eval_samples_per_second": 161.607,
"eval_steps_per_second": 20.203,
"step": 1430
},
{
"epoch": 0.2204869085898025,
"grad_norm": 14.92603874206543,
"learning_rate": 4.448782728525494e-06,
"loss": 0.619,
"step": 1440
},
{
"epoch": 0.2204869085898025,
"eval_accuracy": 0.5938706670809107,
"eval_loss": 0.7095398306846619,
"eval_runtime": 280.0278,
"eval_samples_per_second": 161.227,
"eval_steps_per_second": 20.155,
"step": 1440
},
{
"epoch": 0.22201806767723167,
"grad_norm": 20.692684173583984,
"learning_rate": 4.444954830806921e-06,
"loss": 0.4735,
"step": 1450
},
{
"epoch": 0.22201806767723167,
"eval_accuracy": 0.5908092395766303,
"eval_loss": 0.734937310218811,
"eval_runtime": 279.9051,
"eval_samples_per_second": 161.298,
"eval_steps_per_second": 20.164,
"step": 1450
},
{
"epoch": 0.22354922676466085,
"grad_norm": 17.677717208862305,
"learning_rate": 4.441126933088348e-06,
"loss": 0.6086,
"step": 1460
},
{
"epoch": 0.22354922676466085,
"eval_accuracy": 0.595049395049395,
"eval_loss": 0.7369093894958496,
"eval_runtime": 280.5701,
"eval_samples_per_second": 160.915,
"eval_steps_per_second": 20.116,
"step": 1460
},
{
"epoch": 0.22508038585209003,
"grad_norm": 15.074790954589844,
"learning_rate": 4.4372990353697755e-06,
"loss": 0.5995,
"step": 1470
},
{
"epoch": 0.22508038585209003,
"eval_accuracy": 0.5964959030044634,
"eval_loss": 0.71633380651474,
"eval_runtime": 280.9412,
"eval_samples_per_second": 160.703,
"eval_steps_per_second": 20.09,
"step": 1470
},
{
"epoch": 0.2266115449395192,
"grad_norm": 14.004373550415039,
"learning_rate": 4.433471137651202e-06,
"loss": 0.6036,
"step": 1480
},
{
"epoch": 0.2266115449395192,
"eval_accuracy": 0.5984582574310214,
"eval_loss": 0.7075589895248413,
"eval_runtime": 279.8252,
"eval_samples_per_second": 161.344,
"eval_steps_per_second": 20.17,
"step": 1480
},
{
"epoch": 0.2281427040269484,
"grad_norm": 12.327754974365234,
"learning_rate": 4.42964323993263e-06,
"loss": 0.6168,
"step": 1490
},
{
"epoch": 0.2281427040269484,
"eval_accuracy": 0.5992044974779459,
"eval_loss": 0.692619800567627,
"eval_runtime": 280.1157,
"eval_samples_per_second": 161.176,
"eval_steps_per_second": 20.149,
"step": 1490
},
{
"epoch": 0.2296738631143776,
"grad_norm": 10.927477836608887,
"learning_rate": 4.425815342214056e-06,
"loss": 0.5584,
"step": 1500
},
{
"epoch": 0.2296738631143776,
"eval_accuracy": 0.5985860696738623,
"eval_loss": 0.7029620409011841,
"eval_runtime": 278.9125,
"eval_samples_per_second": 161.872,
"eval_steps_per_second": 20.236,
"step": 1500
},
{
"epoch": 0.23120502220180678,
"grad_norm": 21.215038299560547,
"learning_rate": 4.421987444495484e-06,
"loss": 0.6836,
"step": 1510
},
{
"epoch": 0.23120502220180678,
"eval_accuracy": 0.5978149842341343,
"eval_loss": 0.7012072205543518,
"eval_runtime": 279.942,
"eval_samples_per_second": 161.276,
"eval_steps_per_second": 20.161,
"step": 1510
},
{
"epoch": 0.23273618128923595,
"grad_norm": 18.26300621032715,
"learning_rate": 4.41815954677691e-06,
"loss": 0.5803,
"step": 1520
},
{
"epoch": 0.23273618128923595,
"eval_accuracy": 0.593573744282098,
"eval_loss": 0.7073465585708618,
"eval_runtime": 278.8484,
"eval_samples_per_second": 161.909,
"eval_steps_per_second": 20.24,
"step": 1520
},
{
"epoch": 0.23426734037666513,
"grad_norm": 15.730330467224121,
"learning_rate": 4.414331649058338e-06,
"loss": 0.6735,
"step": 1530
},
{
"epoch": 0.23426734037666513,
"eval_accuracy": 0.5931864173097022,
"eval_loss": 0.694299578666687,
"eval_runtime": 278.339,
"eval_samples_per_second": 162.205,
"eval_steps_per_second": 20.277,
"step": 1530
},
{
"epoch": 0.2357984994640943,
"grad_norm": 10.599174499511719,
"learning_rate": 4.4105037513397645e-06,
"loss": 0.6482,
"step": 1540
},
{
"epoch": 0.2357984994640943,
"eval_accuracy": 0.5938021401081177,
"eval_loss": 0.6790253520011902,
"eval_runtime": 279.1453,
"eval_samples_per_second": 161.737,
"eval_steps_per_second": 20.219,
"step": 1540
},
{
"epoch": 0.2373296585515235,
"grad_norm": 9.95355224609375,
"learning_rate": 4.406675853621192e-06,
"loss": 0.6667,
"step": 1550
},
{
"epoch": 0.2373296585515235,
"eval_accuracy": 0.5938640206460799,
"eval_loss": 0.6704154014587402,
"eval_runtime": 279.4535,
"eval_samples_per_second": 161.558,
"eval_steps_per_second": 20.197,
"step": 1550
},
{
"epoch": 0.2388608176389527,
"grad_norm": 9.302884101867676,
"learning_rate": 4.402847955902619e-06,
"loss": 0.604,
"step": 1560
},
{
"epoch": 0.2388608176389527,
"eval_accuracy": 0.5950828790744243,
"eval_loss": 0.6687915921211243,
"eval_runtime": 277.3792,
"eval_samples_per_second": 162.766,
"eval_steps_per_second": 20.348,
"step": 1560
},
{
"epoch": 0.24039197672638188,
"grad_norm": 8.783987998962402,
"learning_rate": 4.399020058184046e-06,
"loss": 0.5914,
"step": 1570
},
{
"epoch": 0.24039197672638188,
"eval_accuracy": 0.5949302294527408,
"eval_loss": 0.6737338304519653,
"eval_runtime": 277.8845,
"eval_samples_per_second": 162.47,
"eval_steps_per_second": 20.311,
"step": 1570
},
{
"epoch": 0.24192313581381106,
"grad_norm": 8.757774353027344,
"learning_rate": 4.395192160465473e-06,
"loss": 0.629,
"step": 1580
},
{
"epoch": 0.24192313581381106,
"eval_accuracy": 0.5952777963049423,
"eval_loss": 0.6752948760986328,
"eval_runtime": 279.1759,
"eval_samples_per_second": 161.719,
"eval_steps_per_second": 20.217,
"step": 1580
},
{
"epoch": 0.24345429490124024,
"grad_norm": 8.354512214660645,
"learning_rate": 4.3913642627469e-06,
"loss": 0.6632,
"step": 1590
},
{
"epoch": 0.24345429490124024,
"eval_accuracy": 0.5962355663336819,
"eval_loss": 0.6745610237121582,
"eval_runtime": 279.0496,
"eval_samples_per_second": 161.792,
"eval_steps_per_second": 20.226,
"step": 1590
},
{
"epoch": 0.24498545398866942,
"grad_norm": 13.983068466186523,
"learning_rate": 4.387536365028327e-06,
"loss": 0.6018,
"step": 1600
},
{
"epoch": 0.24498545398866942,
"eval_accuracy": 0.5935825309643993,
"eval_loss": 0.687160849571228,
"eval_runtime": 280.0465,
"eval_samples_per_second": 161.216,
"eval_steps_per_second": 20.154,
"step": 1600
},
{
"epoch": 0.2465166130760986,
"grad_norm": 8.103803634643555,
"learning_rate": 4.383708467309754e-06,
"loss": 0.6217,
"step": 1610
},
{
"epoch": 0.2465166130760986,
"eval_accuracy": 0.5935380578595094,
"eval_loss": 0.6901026368141174,
"eval_runtime": 280.8048,
"eval_samples_per_second": 160.781,
"eval_steps_per_second": 20.099,
"step": 1610
},
{
"epoch": 0.2480477721635278,
"grad_norm": 9.161907196044922,
"learning_rate": 4.379880569591181e-06,
"loss": 0.6106,
"step": 1620
},
{
"epoch": 0.2480477721635278,
"eval_accuracy": 0.5978512323160423,
"eval_loss": 0.6946441531181335,
"eval_runtime": 278.4057,
"eval_samples_per_second": 162.166,
"eval_steps_per_second": 20.273,
"step": 1620
},
{
"epoch": 0.24957893125095698,
"grad_norm": 7.822539329528809,
"learning_rate": 4.3760526718726085e-06,
"loss": 0.693,
"step": 1630
},
{
"epoch": 0.24957893125095698,
"eval_accuracy": 0.598020462633452,
"eval_loss": 0.6881946921348572,
"eval_runtime": 277.0846,
"eval_samples_per_second": 162.939,
"eval_steps_per_second": 20.369,
"step": 1630
},
{
"epoch": 0.25111009033838616,
"grad_norm": 8.115804672241211,
"learning_rate": 4.372224774154035e-06,
"loss": 0.6638,
"step": 1640
},
{
"epoch": 0.25111009033838616,
"eval_accuracy": 0.5966775781058632,
"eval_loss": 0.6835174560546875,
"eval_runtime": 277.1947,
"eval_samples_per_second": 162.875,
"eval_steps_per_second": 20.361,
"step": 1640
},
{
"epoch": 0.25264124942581534,
"grad_norm": 8.402555465698242,
"learning_rate": 4.368396876435462e-06,
"loss": 0.5649,
"step": 1650
},
{
"epoch": 0.25264124942581534,
"eval_accuracy": 0.5948628917378918,
"eval_loss": 0.6932902336120605,
"eval_runtime": 278.2801,
"eval_samples_per_second": 162.239,
"eval_steps_per_second": 20.282,
"step": 1650
},
{
"epoch": 0.2541724085132445,
"grad_norm": 9.621747970581055,
"learning_rate": 4.3645689787168885e-06,
"loss": 0.6463,
"step": 1660
},
{
"epoch": 0.2541724085132445,
"eval_accuracy": 0.593183788710789,
"eval_loss": 0.6967864036560059,
"eval_runtime": 277.2248,
"eval_samples_per_second": 162.857,
"eval_steps_per_second": 20.359,
"step": 1660
},
{
"epoch": 0.2557035676006737,
"grad_norm": 17.633258819580078,
"learning_rate": 4.360741080998316e-06,
"loss": 0.5943,
"step": 1670
},
{
"epoch": 0.2557035676006737,
"eval_accuracy": 0.591132348038671,
"eval_loss": 0.7154887318611145,
"eval_runtime": 277.3036,
"eval_samples_per_second": 162.811,
"eval_steps_per_second": 20.353,
"step": 1670
},
{
"epoch": 0.2572347266881029,
"grad_norm": 16.508804321289062,
"learning_rate": 4.356913183279743e-06,
"loss": 0.5856,
"step": 1680
},
{
"epoch": 0.2572347266881029,
"eval_accuracy": 0.5927422936839299,
"eval_loss": 0.7325928211212158,
"eval_runtime": 279.5671,
"eval_samples_per_second": 161.493,
"eval_steps_per_second": 20.188,
"step": 1680
},
{
"epoch": 0.25876588577553206,
"grad_norm": 25.668621063232422,
"learning_rate": 4.35308528556117e-06,
"loss": 0.6454,
"step": 1690
},
{
"epoch": 0.25876588577553206,
"eval_accuracy": 0.5932041424063291,
"eval_loss": 0.7432768940925598,
"eval_runtime": 281.3082,
"eval_samples_per_second": 160.493,
"eval_steps_per_second": 20.063,
"step": 1690
},
{
"epoch": 0.26029704486296124,
"grad_norm": 16.12009620666504,
"learning_rate": 4.349257387842597e-06,
"loss": 0.597,
"step": 1700
},
{
"epoch": 0.26029704486296124,
"eval_accuracy": 0.5940829190340909,
"eval_loss": 0.7180017232894897,
"eval_runtime": 281.0285,
"eval_samples_per_second": 160.653,
"eval_steps_per_second": 20.083,
"step": 1700
},
{
"epoch": 0.26182820395039047,
"grad_norm": 17.72113609313965,
"learning_rate": 4.345429490124024e-06,
"loss": 0.624,
"step": 1710
},
{
"epoch": 0.26182820395039047,
"eval_accuracy": 0.5941739381424987,
"eval_loss": 0.7116958498954773,
"eval_runtime": 279.9549,
"eval_samples_per_second": 161.269,
"eval_steps_per_second": 20.16,
"step": 1710
},
{
"epoch": 0.26335936303781965,
"grad_norm": 14.417743682861328,
"learning_rate": 4.341601592405451e-06,
"loss": 0.5733,
"step": 1720
},
{
"epoch": 0.26335936303781965,
"eval_accuracy": 0.5937305745493295,
"eval_loss": 0.7162705063819885,
"eval_runtime": 278.2449,
"eval_samples_per_second": 162.26,
"eval_steps_per_second": 20.284,
"step": 1720
},
{
"epoch": 0.26489052212524883,
"grad_norm": 18.933935165405273,
"learning_rate": 4.337773694686878e-06,
"loss": 0.5191,
"step": 1730
},
{
"epoch": 0.26489052212524883,
"eval_accuracy": 0.5937839937839938,
"eval_loss": 0.7459293603897095,
"eval_runtime": 280.6883,
"eval_samples_per_second": 160.847,
"eval_steps_per_second": 20.108,
"step": 1730
},
{
"epoch": 0.266421681212678,
"grad_norm": 21.37299346923828,
"learning_rate": 4.333945796968305e-06,
"loss": 0.6065,
"step": 1740
},
{
"epoch": 0.266421681212678,
"eval_accuracy": 0.5947049555047602,
"eval_loss": 0.7559405565261841,
"eval_runtime": 279.6788,
"eval_samples_per_second": 161.428,
"eval_steps_per_second": 20.18,
"step": 1740
},
{
"epoch": 0.2679528403001072,
"grad_norm": 17.455568313598633,
"learning_rate": 4.3301178992497325e-06,
"loss": 0.641,
"step": 1750
},
{
"epoch": 0.2679528403001072,
"eval_accuracy": 0.5933846529272134,
"eval_loss": 0.7480175495147705,
"eval_runtime": 277.648,
"eval_samples_per_second": 162.609,
"eval_steps_per_second": 20.328,
"step": 1750
},
{
"epoch": 0.26948399938753637,
"grad_norm": 14.3558349609375,
"learning_rate": 4.326290001531159e-06,
"loss": 0.6186,
"step": 1760
},
{
"epoch": 0.26948399938753637,
"eval_accuracy": 0.5932022659113628,
"eval_loss": 0.7287299633026123,
"eval_runtime": 281.2354,
"eval_samples_per_second": 160.535,
"eval_steps_per_second": 20.069,
"step": 1760
},
{
"epoch": 0.27101515847496555,
"grad_norm": 10.249687194824219,
"learning_rate": 4.322462103812587e-06,
"loss": 0.6375,
"step": 1770
},
{
"epoch": 0.27101515847496555,
"eval_accuracy": 0.5906849680170576,
"eval_loss": 0.7209318280220032,
"eval_runtime": 279.922,
"eval_samples_per_second": 161.288,
"eval_steps_per_second": 20.163,
"step": 1770
},
{
"epoch": 0.2725463175623947,
"grad_norm": 13.502520561218262,
"learning_rate": 4.318634206094013e-06,
"loss": 0.6078,
"step": 1780
},
{
"epoch": 0.2725463175623947,
"eval_accuracy": 0.590238919975131,
"eval_loss": 0.713438868522644,
"eval_runtime": 281.1596,
"eval_samples_per_second": 160.578,
"eval_steps_per_second": 20.074,
"step": 1780
},
{
"epoch": 0.2740774766498239,
"grad_norm": 8.710155487060547,
"learning_rate": 4.314806308375441e-06,
"loss": 0.6112,
"step": 1790
},
{
"epoch": 0.2740774766498239,
"eval_accuracy": 0.5918866080156403,
"eval_loss": 0.7061217427253723,
"eval_runtime": 278.6715,
"eval_samples_per_second": 162.012,
"eval_steps_per_second": 20.253,
"step": 1790
},
{
"epoch": 0.2756086357372531,
"grad_norm": 12.963603973388672,
"learning_rate": 4.310978410656867e-06,
"loss": 0.6836,
"step": 1800
},
{
"epoch": 0.2756086357372531,
"eval_accuracy": 0.589742449179307,
"eval_loss": 0.7048377394676208,
"eval_runtime": 280.3976,
"eval_samples_per_second": 161.014,
"eval_steps_per_second": 20.129,
"step": 1800
},
{
"epoch": 0.27713979482468226,
"grad_norm": 18.37137794494629,
"learning_rate": 4.307150512938295e-06,
"loss": 0.5662,
"step": 1810
},
{
"epoch": 0.27713979482468226,
"eval_accuracy": 0.5890812901504879,
"eval_loss": 0.7051539421081543,
"eval_runtime": 280.7367,
"eval_samples_per_second": 160.82,
"eval_steps_per_second": 20.104,
"step": 1810
},
{
"epoch": 0.27867095391211144,
"grad_norm": 8.255058288574219,
"learning_rate": 4.3033226152197215e-06,
"loss": 0.6022,
"step": 1820
},
{
"epoch": 0.27867095391211144,
"eval_accuracy": 0.5886953430501244,
"eval_loss": 0.7059171199798584,
"eval_runtime": 278.1826,
"eval_samples_per_second": 162.296,
"eval_steps_per_second": 20.289,
"step": 1820
},
{
"epoch": 0.2802021129995407,
"grad_norm": 12.834601402282715,
"learning_rate": 4.299494717501149e-06,
"loss": 0.5255,
"step": 1830
},
{
"epoch": 0.2802021129995407,
"eval_accuracy": 0.5897788828700826,
"eval_loss": 0.724184513092041,
"eval_runtime": 281.9681,
"eval_samples_per_second": 160.117,
"eval_steps_per_second": 20.016,
"step": 1830
},
{
"epoch": 0.28173327208696985,
"grad_norm": 13.296520233154297,
"learning_rate": 4.295666819782576e-06,
"loss": 0.5974,
"step": 1840
},
{
"epoch": 0.28173327208696985,
"eval_accuracy": 0.5901501208506109,
"eval_loss": 0.7438974380493164,
"eval_runtime": 279.7112,
"eval_samples_per_second": 161.409,
"eval_steps_per_second": 20.178,
"step": 1840
},
{
"epoch": 0.28326443117439903,
"grad_norm": 14.873211860656738,
"learning_rate": 4.291838922064003e-06,
"loss": 0.6871,
"step": 1850
},
{
"epoch": 0.28326443117439903,
"eval_accuracy": 0.5945105702611476,
"eval_loss": 0.7173364162445068,
"eval_runtime": 278.1451,
"eval_samples_per_second": 162.318,
"eval_steps_per_second": 20.292,
"step": 1850
},
{
"epoch": 0.2847955902618282,
"grad_norm": 11.980530738830566,
"learning_rate": 4.28801102434543e-06,
"loss": 0.5518,
"step": 1860
},
{
"epoch": 0.2847955902618282,
"eval_accuracy": 0.5945075210522808,
"eval_loss": 0.7088351845741272,
"eval_runtime": 281.4397,
"eval_samples_per_second": 160.418,
"eval_steps_per_second": 20.054,
"step": 1860
},
{
"epoch": 0.2863267493492574,
"grad_norm": 14.939533233642578,
"learning_rate": 4.2841831266268565e-06,
"loss": 0.5496,
"step": 1870
},
{
"epoch": 0.2863267493492574,
"eval_accuracy": 0.5940158599702348,
"eval_loss": 0.7212331295013428,
"eval_runtime": 279.5218,
"eval_samples_per_second": 161.519,
"eval_steps_per_second": 20.192,
"step": 1870
},
{
"epoch": 0.28785790843668657,
"grad_norm": 15.159697532653809,
"learning_rate": 4.280355228908284e-06,
"loss": 0.5738,
"step": 1880
},
{
"epoch": 0.28785790843668657,
"eval_accuracy": 0.5914114513981358,
"eval_loss": 0.7385027408599854,
"eval_runtime": 279.6403,
"eval_samples_per_second": 161.45,
"eval_steps_per_second": 20.183,
"step": 1880
},
{
"epoch": 0.28938906752411575,
"grad_norm": 10.097131729125977,
"learning_rate": 4.276527331189711e-06,
"loss": 0.5,
"step": 1890
},
{
"epoch": 0.28938906752411575,
"eval_accuracy": 0.5934275634055961,
"eval_loss": 0.7404712438583374,
"eval_runtime": 278.9074,
"eval_samples_per_second": 161.875,
"eval_steps_per_second": 20.236,
"step": 1890
},
{
"epoch": 0.29092022661154493,
"grad_norm": 17.089492797851562,
"learning_rate": 4.272699433471138e-06,
"loss": 0.6033,
"step": 1900
},
{
"epoch": 0.29092022661154493,
"eval_accuracy": 0.5967577397321032,
"eval_loss": 0.7266111373901367,
"eval_runtime": 280.8228,
"eval_samples_per_second": 160.77,
"eval_steps_per_second": 20.098,
"step": 1900
},
{
"epoch": 0.2924513856989741,
"grad_norm": 14.520054817199707,
"learning_rate": 4.268871535752565e-06,
"loss": 0.5852,
"step": 1910
},
{
"epoch": 0.2924513856989741,
"eval_accuracy": 0.5944566495794776,
"eval_loss": 0.7083961367607117,
"eval_runtime": 278.472,
"eval_samples_per_second": 162.128,
"eval_steps_per_second": 20.268,
"step": 1910
},
{
"epoch": 0.2939825447864033,
"grad_norm": 16.736730575561523,
"learning_rate": 4.265043638033992e-06,
"loss": 0.6374,
"step": 1920
},
{
"epoch": 0.2939825447864033,
"eval_accuracy": 0.5979578246392897,
"eval_loss": 0.6861377358436584,
"eval_runtime": 279.2609,
"eval_samples_per_second": 161.67,
"eval_steps_per_second": 20.21,
"step": 1920
},
{
"epoch": 0.29551370387383247,
"grad_norm": 9.897313117980957,
"learning_rate": 4.261215740315419e-06,
"loss": 0.5925,
"step": 1930
},
{
"epoch": 0.29551370387383247,
"eval_accuracy": 0.5983366600133068,
"eval_loss": 0.6827172636985779,
"eval_runtime": 278.8073,
"eval_samples_per_second": 161.933,
"eval_steps_per_second": 20.243,
"step": 1930
},
{
"epoch": 0.2970448629612617,
"grad_norm": 7.9534478187561035,
"learning_rate": 4.257387842596846e-06,
"loss": 0.5634,
"step": 1940
},
{
"epoch": 0.2970448629612617,
"eval_accuracy": 0.5988369512140986,
"eval_loss": 0.684248685836792,
"eval_runtime": 277.3928,
"eval_samples_per_second": 162.758,
"eval_steps_per_second": 20.347,
"step": 1940
},
{
"epoch": 0.2985760220486909,
"grad_norm": 13.70839786529541,
"learning_rate": 4.253559944878273e-06,
"loss": 0.5783,
"step": 1950
},
{
"epoch": 0.2985760220486909,
"eval_accuracy": 0.597880548042389,
"eval_loss": 0.705771267414093,
"eval_runtime": 277.0321,
"eval_samples_per_second": 162.97,
"eval_steps_per_second": 20.373,
"step": 1950
},
{
"epoch": 0.30010718113612006,
"grad_norm": 18.95427703857422,
"learning_rate": 4.2497320471597005e-06,
"loss": 0.7029,
"step": 1960
},
{
"epoch": 0.30010718113612006,
"eval_accuracy": 0.5943931866572036,
"eval_loss": 0.7076370716094971,
"eval_runtime": 279.5179,
"eval_samples_per_second": 161.521,
"eval_steps_per_second": 20.192,
"step": 1960
},
{
"epoch": 0.30163834022354924,
"grad_norm": 12.317983627319336,
"learning_rate": 4.245904149441127e-06,
"loss": 0.562,
"step": 1970
},
{
"epoch": 0.30163834022354924,
"eval_accuracy": 0.5903159950292917,
"eval_loss": 0.6966370344161987,
"eval_runtime": 278.3564,
"eval_samples_per_second": 162.195,
"eval_steps_per_second": 20.276,
"step": 1970
},
{
"epoch": 0.3031694993109784,
"grad_norm": 18.507949829101562,
"learning_rate": 4.242076251722555e-06,
"loss": 0.6133,
"step": 1980
},
{
"epoch": 0.3031694993109784,
"eval_accuracy": 0.5898846495119787,
"eval_loss": 0.697861909866333,
"eval_runtime": 276.7732,
"eval_samples_per_second": 163.123,
"eval_steps_per_second": 20.392,
"step": 1980
},
{
"epoch": 0.3047006583984076,
"grad_norm": 10.3158597946167,
"learning_rate": 4.238248354003981e-06,
"loss": 0.5549,
"step": 1990
},
{
"epoch": 0.3047006583984076,
"eval_accuracy": 0.5933229813664597,
"eval_loss": 0.6916565299034119,
"eval_runtime": 279.0507,
"eval_samples_per_second": 161.791,
"eval_steps_per_second": 20.226,
"step": 1990
},
{
"epoch": 0.3062318174858368,
"grad_norm": 17.062057495117188,
"learning_rate": 4.234420456285409e-06,
"loss": 0.6238,
"step": 2000
},
{
"epoch": 0.3062318174858368,
"eval_accuracy": 0.5943655723158828,
"eval_loss": 0.7041603326797485,
"eval_runtime": 280.3627,
"eval_samples_per_second": 161.034,
"eval_steps_per_second": 20.131,
"step": 2000
},
{
"epoch": 0.30776297657326596,
"grad_norm": 7.667088985443115,
"learning_rate": 4.230592558566835e-06,
"loss": 0.6945,
"step": 2010
},
{
"epoch": 0.30776297657326596,
"eval_accuracy": 0.5923155464796236,
"eval_loss": 0.695047914981842,
"eval_runtime": 282.0476,
"eval_samples_per_second": 160.072,
"eval_steps_per_second": 20.011,
"step": 2010
},
{
"epoch": 0.30929413566069514,
"grad_norm": 13.864084243774414,
"learning_rate": 4.226764660848263e-06,
"loss": 0.6421,
"step": 2020
},
{
"epoch": 0.30929413566069514,
"eval_accuracy": 0.5927388930806444,
"eval_loss": 0.6951669454574585,
"eval_runtime": 282.3074,
"eval_samples_per_second": 159.925,
"eval_steps_per_second": 19.992,
"step": 2020
},
{
"epoch": 0.3108252947481243,
"grad_norm": 9.97375202178955,
"learning_rate": 4.2229367631296895e-06,
"loss": 0.5758,
"step": 2030
},
{
"epoch": 0.3108252947481243,
"eval_accuracy": 0.5917714488825698,
"eval_loss": 0.6952547430992126,
"eval_runtime": 281.5694,
"eval_samples_per_second": 160.344,
"eval_steps_per_second": 20.045,
"step": 2030
},
{
"epoch": 0.3123564538355535,
"grad_norm": 7.828521251678467,
"learning_rate": 4.219108865411117e-06,
"loss": 0.6181,
"step": 2040
},
{
"epoch": 0.3123564538355535,
"eval_accuracy": 0.5886520097712636,
"eval_loss": 0.6984680891036987,
"eval_runtime": 278.5618,
"eval_samples_per_second": 162.075,
"eval_steps_per_second": 20.261,
"step": 2040
},
{
"epoch": 0.3138876129229827,
"grad_norm": 10.627179145812988,
"learning_rate": 4.215280967692544e-06,
"loss": 0.6605,
"step": 2050
},
{
"epoch": 0.3138876129229827,
"eval_accuracy": 0.5845493371296379,
"eval_loss": 0.6960271000862122,
"eval_runtime": 278.4801,
"eval_samples_per_second": 162.123,
"eval_steps_per_second": 20.267,
"step": 2050
},
{
"epoch": 0.3154187720104119,
"grad_norm": 6.945221424102783,
"learning_rate": 4.211453069973971e-06,
"loss": 0.6138,
"step": 2060
},
{
"epoch": 0.3154187720104119,
"eval_accuracy": 0.5852839088643645,
"eval_loss": 0.6904491782188416,
"eval_runtime": 276.524,
"eval_samples_per_second": 163.27,
"eval_steps_per_second": 20.411,
"step": 2060
},
{
"epoch": 0.3169499310978411,
"grad_norm": 13.37806224822998,
"learning_rate": 4.207625172255398e-06,
"loss": 0.5744,
"step": 2070
},
{
"epoch": 0.3169499310978411,
"eval_accuracy": 0.5887589069679682,
"eval_loss": 0.6954379677772522,
"eval_runtime": 275.6555,
"eval_samples_per_second": 163.784,
"eval_steps_per_second": 20.475,
"step": 2070
},
{
"epoch": 0.31848109018527027,
"grad_norm": 11.931571006774902,
"learning_rate": 4.203797274536825e-06,
"loss": 0.5473,
"step": 2080
},
{
"epoch": 0.31848109018527027,
"eval_accuracy": 0.589274223967694,
"eval_loss": 0.7085046172142029,
"eval_runtime": 276.9102,
"eval_samples_per_second": 163.042,
"eval_steps_per_second": 20.382,
"step": 2080
},
{
"epoch": 0.32001224927269944,
"grad_norm": 17.946001052856445,
"learning_rate": 4.199969376818252e-06,
"loss": 0.6201,
"step": 2090
},
{
"epoch": 0.32001224927269944,
"eval_accuracy": 0.5832519747936452,
"eval_loss": 0.7224695086479187,
"eval_runtime": 278.3015,
"eval_samples_per_second": 162.227,
"eval_steps_per_second": 20.28,
"step": 2090
},
{
"epoch": 0.3215434083601286,
"grad_norm": 9.482304573059082,
"learning_rate": 4.1961414790996794e-06,
"loss": 0.5663,
"step": 2100
},
{
"epoch": 0.3215434083601286,
"eval_accuracy": 0.5839268676917615,
"eval_loss": 0.7226927876472473,
"eval_runtime": 280.269,
"eval_samples_per_second": 161.088,
"eval_steps_per_second": 20.138,
"step": 2100
},
{
"epoch": 0.3230745674475578,
"grad_norm": 10.172694206237793,
"learning_rate": 4.192313581381106e-06,
"loss": 0.612,
"step": 2110
},
{
"epoch": 0.3230745674475578,
"eval_accuracy": 0.5900024405937299,
"eval_loss": 0.7088232040405273,
"eval_runtime": 280.1784,
"eval_samples_per_second": 161.14,
"eval_steps_per_second": 20.144,
"step": 2110
},
{
"epoch": 0.324605726534987,
"grad_norm": 11.057249069213867,
"learning_rate": 4.188485683662533e-06,
"loss": 0.5937,
"step": 2120
},
{
"epoch": 0.324605726534987,
"eval_accuracy": 0.5903734771320152,
"eval_loss": 0.7097996473312378,
"eval_runtime": 281.3342,
"eval_samples_per_second": 160.478,
"eval_steps_per_second": 20.062,
"step": 2120
},
{
"epoch": 0.32613688562241616,
"grad_norm": 12.521862030029297,
"learning_rate": 4.184657785943959e-06,
"loss": 0.6988,
"step": 2130
},
{
"epoch": 0.32613688562241616,
"eval_accuracy": 0.5909282466452257,
"eval_loss": 0.6956667900085449,
"eval_runtime": 280.7428,
"eval_samples_per_second": 160.816,
"eval_steps_per_second": 20.104,
"step": 2130
},
{
"epoch": 0.32766804470984534,
"grad_norm": 13.895928382873535,
"learning_rate": 4.180829888225387e-06,
"loss": 0.4822,
"step": 2140
},
{
"epoch": 0.32766804470984534,
"eval_accuracy": 0.5896343627973021,
"eval_loss": 0.7213166356086731,
"eval_runtime": 281.166,
"eval_samples_per_second": 160.574,
"eval_steps_per_second": 20.074,
"step": 2140
},
{
"epoch": 0.3291992037972745,
"grad_norm": 11.10944938659668,
"learning_rate": 4.1770019905068135e-06,
"loss": 0.5878,
"step": 2150
},
{
"epoch": 0.3291992037972745,
"eval_accuracy": 0.5907275953859805,
"eval_loss": 0.742756724357605,
"eval_runtime": 281.9419,
"eval_samples_per_second": 160.132,
"eval_steps_per_second": 20.018,
"step": 2150
},
{
"epoch": 0.3307303628847037,
"grad_norm": 12.602340698242188,
"learning_rate": 4.173174092788241e-06,
"loss": 0.5722,
"step": 2160
},
{
"epoch": 0.3307303628847037,
"eval_accuracy": 0.590145030380982,
"eval_loss": 0.7571865320205688,
"eval_runtime": 279.9038,
"eval_samples_per_second": 161.298,
"eval_steps_per_second": 20.164,
"step": 2160
},
{
"epoch": 0.3322615219721329,
"grad_norm": 18.790254592895508,
"learning_rate": 4.169346195069668e-06,
"loss": 0.6094,
"step": 2170
},
{
"epoch": 0.3322615219721329,
"eval_accuracy": 0.5902217294900222,
"eval_loss": 0.7526936531066895,
"eval_runtime": 280.4762,
"eval_samples_per_second": 160.969,
"eval_steps_per_second": 20.123,
"step": 2170
},
{
"epoch": 0.3337926810595621,
"grad_norm": 13.405548095703125,
"learning_rate": 4.165518297351095e-06,
"loss": 0.693,
"step": 2180
},
{
"epoch": 0.3337926810595621,
"eval_accuracy": 0.5901581176679307,
"eval_loss": 0.7200701832771301,
"eval_runtime": 281.5673,
"eval_samples_per_second": 160.345,
"eval_steps_per_second": 20.045,
"step": 2180
},
{
"epoch": 0.3353238401469913,
"grad_norm": 10.354043006896973,
"learning_rate": 4.161690399632522e-06,
"loss": 0.499,
"step": 2190
},
{
"epoch": 0.3353238401469913,
"eval_accuracy": 0.5892896756732774,
"eval_loss": 0.721836507320404,
"eval_runtime": 279.1391,
"eval_samples_per_second": 161.74,
"eval_steps_per_second": 20.219,
"step": 2190
},
{
"epoch": 0.33685499923442047,
"grad_norm": 8.689166069030762,
"learning_rate": 4.157862501913949e-06,
"loss": 0.594,
"step": 2200
},
{
"epoch": 0.33685499923442047,
"eval_accuracy": 0.5895724296992815,
"eval_loss": 0.7207421064376831,
"eval_runtime": 279.1316,
"eval_samples_per_second": 161.744,
"eval_steps_per_second": 20.22,
"step": 2200
},
{
"epoch": 0.33838615832184965,
"grad_norm": 12.664347648620605,
"learning_rate": 4.154034604195376e-06,
"loss": 0.5292,
"step": 2210
},
{
"epoch": 0.33838615832184965,
"eval_accuracy": 0.5918439794990127,
"eval_loss": 0.7299882173538208,
"eval_runtime": 281.451,
"eval_samples_per_second": 160.412,
"eval_steps_per_second": 20.053,
"step": 2210
},
{
"epoch": 0.33991731740927883,
"grad_norm": 14.595951080322266,
"learning_rate": 4.150206706476803e-06,
"loss": 0.5728,
"step": 2220
},
{
"epoch": 0.33991731740927883,
"eval_accuracy": 0.5933771015392805,
"eval_loss": 0.7359711527824402,
"eval_runtime": 281.6141,
"eval_samples_per_second": 160.319,
"eval_steps_per_second": 20.042,
"step": 2220
},
{
"epoch": 0.341448476496708,
"grad_norm": 16.81365203857422,
"learning_rate": 4.14637880875823e-06,
"loss": 0.6216,
"step": 2230
},
{
"epoch": 0.341448476496708,
"eval_accuracy": 0.5928677563150074,
"eval_loss": 0.7266600728034973,
"eval_runtime": 281.4751,
"eval_samples_per_second": 160.398,
"eval_steps_per_second": 20.052,
"step": 2230
},
{
"epoch": 0.3429796355841372,
"grad_norm": 9.753067016601562,
"learning_rate": 4.1425509110396575e-06,
"loss": 0.5759,
"step": 2240
},
{
"epoch": 0.3429796355841372,
"eval_accuracy": 0.5927989522519923,
"eval_loss": 0.7114787697792053,
"eval_runtime": 281.2888,
"eval_samples_per_second": 160.504,
"eval_steps_per_second": 20.065,
"step": 2240
},
{
"epoch": 0.34451079467156637,
"grad_norm": 10.276047706604004,
"learning_rate": 4.138723013321084e-06,
"loss": 0.621,
"step": 2250
},
{
"epoch": 0.34451079467156637,
"eval_accuracy": 0.5948861366360367,
"eval_loss": 0.7070339918136597,
"eval_runtime": 280.8031,
"eval_samples_per_second": 160.782,
"eval_steps_per_second": 20.099,
"step": 2250
},
{
"epoch": 0.34604195375899555,
"grad_norm": 11.647406578063965,
"learning_rate": 4.134895115602512e-06,
"loss": 0.6023,
"step": 2260
},
{
"epoch": 0.34604195375899555,
"eval_accuracy": 0.5949940087871123,
"eval_loss": 0.7148999571800232,
"eval_runtime": 280.6819,
"eval_samples_per_second": 160.851,
"eval_steps_per_second": 20.108,
"step": 2260
},
{
"epoch": 0.3475731128464247,
"grad_norm": 9.785872459411621,
"learning_rate": 4.131067217883938e-06,
"loss": 0.578,
"step": 2270
},
{
"epoch": 0.3475731128464247,
"eval_accuracy": 0.59318833174113,
"eval_loss": 0.7126178741455078,
"eval_runtime": 281.5251,
"eval_samples_per_second": 160.369,
"eval_steps_per_second": 20.048,
"step": 2270
},
{
"epoch": 0.3491042719338539,
"grad_norm": 11.013738632202148,
"learning_rate": 4.127239320165366e-06,
"loss": 0.5701,
"step": 2280
},
{
"epoch": 0.3491042719338539,
"eval_accuracy": 0.5925876549793361,
"eval_loss": 0.7025783061981201,
"eval_runtime": 278.114,
"eval_samples_per_second": 162.336,
"eval_steps_per_second": 20.294,
"step": 2280
},
{
"epoch": 0.3506354310212831,
"grad_norm": 9.779340744018555,
"learning_rate": 4.1234114224467924e-06,
"loss": 0.6761,
"step": 2290
},
{
"epoch": 0.3506354310212831,
"eval_accuracy": 0.5935654336338203,
"eval_loss": 0.6881637573242188,
"eval_runtime": 281.0803,
"eval_samples_per_second": 160.623,
"eval_steps_per_second": 20.08,
"step": 2290
},
{
"epoch": 0.3521665901087123,
"grad_norm": 13.62732219696045,
"learning_rate": 4.11958352472822e-06,
"loss": 0.5771,
"step": 2300
},
{
"epoch": 0.3521665901087123,
"eval_accuracy": 0.5967165834719911,
"eval_loss": 0.6921752691268921,
"eval_runtime": 278.472,
"eval_samples_per_second": 162.128,
"eval_steps_per_second": 20.268,
"step": 2300
},
{
"epoch": 0.3536977491961415,
"grad_norm": 13.277196884155273,
"learning_rate": 4.1157556270096466e-06,
"loss": 0.6241,
"step": 2310
},
{
"epoch": 0.3536977491961415,
"eval_accuracy": 0.5976616231086658,
"eval_loss": 0.6972672939300537,
"eval_runtime": 279.3077,
"eval_samples_per_second": 161.643,
"eval_steps_per_second": 20.207,
"step": 2310
},
{
"epoch": 0.3552289082835707,
"grad_norm": 11.036153793334961,
"learning_rate": 4.111927729291074e-06,
"loss": 0.6102,
"step": 2320
},
{
"epoch": 0.3552289082835707,
"eval_accuracy": 0.5959098571555319,
"eval_loss": 0.6897289752960205,
"eval_runtime": 280.1189,
"eval_samples_per_second": 161.174,
"eval_steps_per_second": 20.149,
"step": 2320
},
{
"epoch": 0.35676006737099986,
"grad_norm": 16.50404167175293,
"learning_rate": 4.108099831572501e-06,
"loss": 0.5876,
"step": 2330
},
{
"epoch": 0.35676006737099986,
"eval_accuracy": 0.595568665720369,
"eval_loss": 0.6913372874259949,
"eval_runtime": 279.8966,
"eval_samples_per_second": 161.302,
"eval_steps_per_second": 20.165,
"step": 2330
},
{
"epoch": 0.35829122645842904,
"grad_norm": 10.642626762390137,
"learning_rate": 4.104271933853927e-06,
"loss": 0.651,
"step": 2340
},
{
"epoch": 0.35829122645842904,
"eval_accuracy": 0.5946874792133212,
"eval_loss": 0.6878921389579773,
"eval_runtime": 280.9394,
"eval_samples_per_second": 160.704,
"eval_steps_per_second": 20.09,
"step": 2340
},
{
"epoch": 0.3598223855458582,
"grad_norm": 13.040077209472656,
"learning_rate": 4.100444036135355e-06,
"loss": 0.5587,
"step": 2350
},
{
"epoch": 0.3598223855458582,
"eval_accuracy": 0.5935343584281879,
"eval_loss": 0.6936639547348022,
"eval_runtime": 282.096,
"eval_samples_per_second": 160.045,
"eval_steps_per_second": 20.007,
"step": 2350
},
{
"epoch": 0.3613535446332874,
"grad_norm": 10.807535171508789,
"learning_rate": 4.0966161384167815e-06,
"loss": 0.6514,
"step": 2360
},
{
"epoch": 0.3613535446332874,
"eval_accuracy": 0.5977954711792233,
"eval_loss": 0.6872532963752747,
"eval_runtime": 279.4925,
"eval_samples_per_second": 161.536,
"eval_steps_per_second": 20.194,
"step": 2360
},
{
"epoch": 0.3628847037207166,
"grad_norm": 10.98725700378418,
"learning_rate": 4.092788240698209e-06,
"loss": 0.6015,
"step": 2370
},
{
"epoch": 0.3628847037207166,
"eval_accuracy": 0.5974941789555384,
"eval_loss": 0.6847018003463745,
"eval_runtime": 281.2012,
"eval_samples_per_second": 160.554,
"eval_steps_per_second": 20.071,
"step": 2370
},
{
"epoch": 0.36441586280814575,
"grad_norm": 12.160524368286133,
"learning_rate": 4.088960342979636e-06,
"loss": 0.5671,
"step": 2380
},
{
"epoch": 0.36441586280814575,
"eval_accuracy": 0.598935344349562,
"eval_loss": 0.6907532811164856,
"eval_runtime": 281.9981,
"eval_samples_per_second": 160.1,
"eval_steps_per_second": 20.014,
"step": 2380
},
{
"epoch": 0.36594702189557493,
"grad_norm": 12.533185005187988,
"learning_rate": 4.085132445261063e-06,
"loss": 0.6757,
"step": 2390
},
{
"epoch": 0.36594702189557493,
"eval_accuracy": 0.5970619563287769,
"eval_loss": 0.6963858008384705,
"eval_runtime": 281.702,
"eval_samples_per_second": 160.269,
"eval_steps_per_second": 20.035,
"step": 2390
},
{
"epoch": 0.3674781809830041,
"grad_norm": 11.481986045837402,
"learning_rate": 4.08130454754249e-06,
"loss": 0.6244,
"step": 2400
},
{
"epoch": 0.3674781809830041,
"eval_accuracy": 0.5956671480946562,
"eval_loss": 0.6951790452003479,
"eval_runtime": 281.7788,
"eval_samples_per_second": 160.225,
"eval_steps_per_second": 20.03,
"step": 2400
},
{
"epoch": 0.36900934007043334,
"grad_norm": 15.283388137817383,
"learning_rate": 4.077476649823917e-06,
"loss": 0.5761,
"step": 2410
},
{
"epoch": 0.36900934007043334,
"eval_accuracy": 0.5921616520484068,
"eval_loss": 0.7129482626914978,
"eval_runtime": 280.9999,
"eval_samples_per_second": 160.669,
"eval_steps_per_second": 20.085,
"step": 2410
},
{
"epoch": 0.3705404991578625,
"grad_norm": 14.590538024902344,
"learning_rate": 4.073648752105344e-06,
"loss": 0.5847,
"step": 2420
},
{
"epoch": 0.3705404991578625,
"eval_accuracy": 0.5910292582142936,
"eval_loss": 0.7289432287216187,
"eval_runtime": 281.5113,
"eval_samples_per_second": 160.377,
"eval_steps_per_second": 20.049,
"step": 2420
},
{
"epoch": 0.3720716582452917,
"grad_norm": 14.669201850891113,
"learning_rate": 4.069820854386771e-06,
"loss": 0.5957,
"step": 2430
},
{
"epoch": 0.3720716582452917,
"eval_accuracy": 0.5891025356365736,
"eval_loss": 0.7361324429512024,
"eval_runtime": 278.169,
"eval_samples_per_second": 162.304,
"eval_steps_per_second": 20.29,
"step": 2430
},
{
"epoch": 0.3736028173327209,
"grad_norm": 9.489580154418945,
"learning_rate": 4.065992956668198e-06,
"loss": 0.5718,
"step": 2440
},
{
"epoch": 0.3736028173327209,
"eval_accuracy": 0.5889370209930024,
"eval_loss": 0.7279490828514099,
"eval_runtime": 277.7775,
"eval_samples_per_second": 162.533,
"eval_steps_per_second": 20.318,
"step": 2440
},
{
"epoch": 0.37513397642015006,
"grad_norm": 15.029380798339844,
"learning_rate": 4.0621650589496255e-06,
"loss": 0.6081,
"step": 2450
},
{
"epoch": 0.37513397642015006,
"eval_accuracy": 0.5909000155289837,
"eval_loss": 0.72515469789505,
"eval_runtime": 280.1896,
"eval_samples_per_second": 161.134,
"eval_steps_per_second": 20.143,
"step": 2450
},
{
"epoch": 0.37666513550757924,
"grad_norm": 12.974061965942383,
"learning_rate": 4.058337161231052e-06,
"loss": 0.5805,
"step": 2460
},
{
"epoch": 0.37666513550757924,
"eval_accuracy": 0.5923000110950849,
"eval_loss": 0.7263885736465454,
"eval_runtime": 277.833,
"eval_samples_per_second": 162.501,
"eval_steps_per_second": 20.314,
"step": 2460
},
{
"epoch": 0.3781962945950084,
"grad_norm": 17.26422119140625,
"learning_rate": 4.05450926351248e-06,
"loss": 0.6574,
"step": 2470
},
{
"epoch": 0.3781962945950084,
"eval_accuracy": 0.5921569497769591,
"eval_loss": 0.7078375816345215,
"eval_runtime": 278.2251,
"eval_samples_per_second": 162.271,
"eval_steps_per_second": 20.286,
"step": 2470
},
{
"epoch": 0.3797274536824376,
"grad_norm": 13.827315330505371,
"learning_rate": 4.050681365793906e-06,
"loss": 0.6347,
"step": 2480
},
{
"epoch": 0.3797274536824376,
"eval_accuracy": 0.5945813901843215,
"eval_loss": 0.700303316116333,
"eval_runtime": 280.284,
"eval_samples_per_second": 161.079,
"eval_steps_per_second": 20.137,
"step": 2480
},
{
"epoch": 0.3812586127698668,
"grad_norm": 12.102642059326172,
"learning_rate": 4.046853468075334e-06,
"loss": 0.6385,
"step": 2490
},
{
"epoch": 0.3812586127698668,
"eval_accuracy": 0.5984858576439768,
"eval_loss": 0.6862630844116211,
"eval_runtime": 277.6537,
"eval_samples_per_second": 162.605,
"eval_steps_per_second": 20.327,
"step": 2490
},
{
"epoch": 0.38278977185729596,
"grad_norm": 8.007050514221191,
"learning_rate": 4.04302557035676e-06,
"loss": 0.5878,
"step": 2500
},
{
"epoch": 0.38278977185729596,
"eval_accuracy": 0.6000088768558177,
"eval_loss": 0.6816014647483826,
"eval_runtime": 278.6731,
"eval_samples_per_second": 162.011,
"eval_steps_per_second": 20.253,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 13062,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}