{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.96,
"eval_steps": 500,
"global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004,
"grad_norm": 8.119571685791016,
"learning_rate": 0.0,
"loss": 3.4677,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 8.098031997680664,
"learning_rate": 9e-06,
"loss": 4.4682,
"step": 10
},
{
"epoch": 0.008,
"grad_norm": 8.463135719299316,
"learning_rate": 1.9e-05,
"loss": 4.6132,
"step": 20
},
{
"epoch": 0.012,
"grad_norm": 7.452148914337158,
"learning_rate": 2.9e-05,
"loss": 4.223,
"step": 30
},
{
"epoch": 0.016,
"grad_norm": 5.572236061096191,
"learning_rate": 3.9000000000000006e-05,
"loss": 4.4111,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 7.39447021484375,
"learning_rate": 4.9e-05,
"loss": 4.0543,
"step": 50
},
{
"epoch": 0.024,
"grad_norm": 9.954078674316406,
"learning_rate": 4.999833521640187e-05,
"loss": 4.5007,
"step": 60
},
{
"epoch": 0.028,
"grad_norm": 5.994736194610596,
"learning_rate": 4.9992580693557054e-05,
"loss": 4.6204,
"step": 70
},
{
"epoch": 0.032,
"grad_norm": 7.18951940536499,
"learning_rate": 4.998271682453017e-05,
"loss": 4.2467,
"step": 80
},
{
"epoch": 0.036,
"grad_norm": 9.591761589050293,
"learning_rate": 4.996874523116464e-05,
"loss": 4.4063,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 7.566534519195557,
"learning_rate": 4.995066821070679e-05,
"loss": 4.0773,
"step": 100
},
{
"epoch": 0.044,
"grad_norm": 5.793056488037109,
"learning_rate": 4.9928488735428105e-05,
"loss": 4.063,
"step": 110
},
{
"epoch": 0.048,
"grad_norm": 7.917051792144775,
"learning_rate": 4.990221045213652e-05,
"loss": 4.2533,
"step": 120
},
{
"epoch": 0.052,
"grad_norm": 7.804363250732422,
"learning_rate": 4.987183768157686e-05,
"loss": 4.0497,
"step": 130
},
{
"epoch": 0.056,
"grad_norm": 7.605108261108398,
"learning_rate": 4.983737541772033e-05,
"loss": 4.4334,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 9.323838233947754,
"learning_rate": 4.979882932694346e-05,
"loss": 4.0412,
"step": 150
},
{
"epoch": 0.064,
"grad_norm": 9.219818115234375,
"learning_rate": 4.9756205747096385e-05,
"loss": 3.9774,
"step": 160
},
{
"epoch": 0.068,
"grad_norm": 8.375937461853027,
"learning_rate": 4.9709511686460775e-05,
"loss": 4.0021,
"step": 170
},
{
"epoch": 0.072,
"grad_norm": 8.215436935424805,
"learning_rate": 4.96587548225975e-05,
"loss": 4.4227,
"step": 180
},
{
"epoch": 0.076,
"grad_norm": 9.561295509338379,
"learning_rate": 4.960394350108429e-05,
"loss": 4.1091,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 9.014528274536133,
"learning_rate": 4.954508673414351e-05,
"loss": 3.9428,
"step": 200
},
{
"epoch": 0.084,
"grad_norm": 7.69431209564209,
"learning_rate": 4.948219419916037e-05,
"loss": 4.368,
"step": 210
},
{
"epoch": 0.088,
"grad_norm": 11.925583839416504,
"learning_rate": 4.941527623709172e-05,
"loss": 3.6757,
"step": 220
},
{
"epoch": 0.092,
"grad_norm": 8.191117286682129,
"learning_rate": 4.934434385076576e-05,
"loss": 4.1905,
"step": 230
},
{
"epoch": 0.096,
"grad_norm": 7.863613128662109,
"learning_rate": 4.926940870307296e-05,
"loss": 4.0099,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 10.665002822875977,
"learning_rate": 4.9190483115048375e-05,
"loss": 3.9059,
"step": 250
},
{
"epoch": 0.104,
"grad_norm": 10.385906219482422,
"learning_rate": 4.910758006384583e-05,
"loss": 3.9221,
"step": 260
},
{
"epoch": 0.108,
"grad_norm": 8.544922828674316,
"learning_rate": 4.9020713180604126e-05,
"loss": 3.9398,
"step": 270
},
{
"epoch": 0.112,
"grad_norm": 7.989080429077148,
"learning_rate": 4.892989674820585e-05,
"loss": 3.7757,
"step": 280
},
{
"epoch": 0.116,
"grad_norm": 6.576107025146484,
"learning_rate": 4.8835145698928856e-05,
"loss": 3.5309,
"step": 290
},
{
"epoch": 0.12,
"grad_norm": 9.80089282989502,
"learning_rate": 4.873647561199115e-05,
"loss": 4.1776,
"step": 300
},
{
"epoch": 0.124,
"grad_norm": 15.050427436828613,
"learning_rate": 4.863390271098922e-05,
"loss": 3.5808,
"step": 310
},
{
"epoch": 0.128,
"grad_norm": 8.734102249145508,
"learning_rate": 4.852744386123061e-05,
"loss": 3.9796,
"step": 320
},
{
"epoch": 0.132,
"grad_norm": 8.711186408996582,
"learning_rate": 4.84171165669608e-05,
"loss": 4.2317,
"step": 330
},
{
"epoch": 0.136,
"grad_norm": 6.751059055328369,
"learning_rate": 4.8302938968485144e-05,
"loss": 3.7145,
"step": 340
},
{
"epoch": 0.14,
"grad_norm": 10.623860359191895,
"learning_rate": 4.8184929839186196e-05,
"loss": 3.9616,
"step": 350
},
{
"epoch": 0.144,
"grad_norm": 7.5071330070495605,
"learning_rate": 4.806310858243694e-05,
"loss": 4.0164,
"step": 360
},
{
"epoch": 0.148,
"grad_norm": 8.607765197753906,
"learning_rate": 4.793749522841042e-05,
"loss": 4.4924,
"step": 370
},
{
"epoch": 0.152,
"grad_norm": 8.406026840209961,
"learning_rate": 4.780811043078636e-05,
"loss": 3.4254,
"step": 380
},
{
"epoch": 0.156,
"grad_norm": 9.387131690979004,
"learning_rate": 4.767497546335519e-05,
"loss": 3.9158,
"step": 390
},
{
"epoch": 0.16,
"grad_norm": 7.5071258544921875,
"learning_rate": 4.753811221652017e-05,
"loss": 4.1042,
"step": 400
},
{
"epoch": 0.164,
"grad_norm": 6.716228008270264,
"learning_rate": 4.739754319369814e-05,
"loss": 3.8632,
"step": 410
},
{
"epoch": 0.168,
"grad_norm": 9.47385311126709,
"learning_rate": 4.7253291507619404e-05,
"loss": 3.7837,
"step": 420
},
{
"epoch": 0.172,
"grad_norm": 7.547070026397705,
"learning_rate": 4.710538087652748e-05,
"loss": 4.0398,
"step": 430
},
{
"epoch": 0.176,
"grad_norm": 13.61339282989502,
"learning_rate": 4.695383562027933e-05,
"loss": 3.7789,
"step": 440
},
{
"epoch": 0.18,
"grad_norm": 6.434921741485596,
"learning_rate": 4.679868065634656e-05,
"loss": 3.9506,
"step": 450
},
{
"epoch": 0.184,
"grad_norm": 7.865591049194336,
"learning_rate": 4.663994149571849e-05,
"loss": 3.4036,
"step": 460
},
{
"epoch": 0.188,
"grad_norm": 9.311790466308594,
"learning_rate": 4.647764423870751e-05,
"loss": 4.1299,
"step": 470
},
{
"epoch": 0.192,
"grad_norm": 8.277907371520996,
"learning_rate": 4.631181557065761e-05,
"loss": 4.0614,
"step": 480
},
{
"epoch": 0.196,
"grad_norm": 12.398967742919922,
"learning_rate": 4.614248275755676e-05,
"loss": 3.7492,
"step": 490
},
{
"epoch": 0.2,
"grad_norm": 7.308017730712891,
"learning_rate": 4.5969673641553685e-05,
"loss": 4.1606,
"step": 500
},
{
"epoch": 0.204,
"grad_norm": 6.510436058044434,
"learning_rate": 4.579341663638004e-05,
"loss": 3.5708,
"step": 510
},
{
"epoch": 0.208,
"grad_norm": 10.914970397949219,
"learning_rate": 4.5613740722678525e-05,
"loss": 3.4741,
"step": 520
},
{
"epoch": 0.212,
"grad_norm": 8.786978721618652,
"learning_rate": 4.5430675443237817e-05,
"loss": 3.6204,
"step": 530
},
{
"epoch": 0.216,
"grad_norm": 10.566540718078613,
"learning_rate": 4.524425089813507e-05,
"loss": 3.9298,
"step": 540
},
{
"epoch": 0.22,
"grad_norm": 8.051084518432617,
"learning_rate": 4.505449773978677e-05,
"loss": 3.7783,
"step": 550
},
{
"epoch": 0.224,
"grad_norm": 11.182727813720703,
"learning_rate": 4.4861447167908824e-05,
"loss": 3.8174,
"step": 560
},
{
"epoch": 0.228,
"grad_norm": 11.375614166259766,
"learning_rate": 4.466513092438653e-05,
"loss": 4.0511,
"step": 570
},
{
"epoch": 0.232,
"grad_norm": 8.66441535949707,
"learning_rate": 4.446558128805561e-05,
"loss": 3.7058,
"step": 580
},
{
"epoch": 0.236,
"grad_norm": 5.743879795074463,
"learning_rate": 4.426283106939474e-05,
"loss": 3.817,
"step": 590
},
{
"epoch": 0.24,
"grad_norm": 10.231585502624512,
"learning_rate": 4.4056913605130804e-05,
"loss": 3.9779,
"step": 600
},
{
"epoch": 0.244,
"grad_norm": 5.9834465980529785,
"learning_rate": 4.3847862752757604e-05,
"loss": 3.4466,
"step": 610
},
{
"epoch": 0.248,
"grad_norm": 9.162353515625,
"learning_rate": 4.363571288496888e-05,
"loss": 3.576,
"step": 620
},
{
"epoch": 0.252,
"grad_norm": 10.162070274353027,
"learning_rate": 4.342049888400669e-05,
"loss": 4.084,
"step": 630
},
{
"epoch": 0.256,
"grad_norm": 12.520784378051758,
"learning_rate": 4.3202256135925956e-05,
"loss": 3.6774,
"step": 640
},
{
"epoch": 0.26,
"grad_norm": 10.249221801757812,
"learning_rate": 4.298102052477621e-05,
"loss": 3.9724,
"step": 650
},
{
"epoch": 0.264,
"grad_norm": 10.406034469604492,
"learning_rate": 4.2756828426701426e-05,
"loss": 3.9906,
"step": 660
},
{
"epoch": 0.268,
"grad_norm": 15.699187278747559,
"learning_rate": 4.2529716703959024e-05,
"loss": 3.2696,
"step": 670
},
{
"epoch": 0.272,
"grad_norm": 10.240876197814941,
"learning_rate": 4.229972269885877e-05,
"loss": 3.2456,
"step": 680
},
{
"epoch": 0.276,
"grad_norm": 11.525603294372559,
"learning_rate": 4.206688422762295e-05,
"loss": 3.6349,
"step": 690
},
{
"epoch": 0.28,
"grad_norm": 6.479814052581787,
"learning_rate": 4.1831239574168493e-05,
"loss": 3.5024,
"step": 700
},
{
"epoch": 0.284,
"grad_norm": 10.296248435974121,
"learning_rate": 4.159282748381218e-05,
"loss": 4.0567,
"step": 710
},
{
"epoch": 0.288,
"grad_norm": 13.293269157409668,
"learning_rate": 4.135168715690015e-05,
"loss": 3.9591,
"step": 720
},
{
"epoch": 0.292,
"grad_norm": 7.214468479156494,
"learning_rate": 4.110785824236236e-05,
"loss": 3.8723,
"step": 730
},
{
"epoch": 0.296,
"grad_norm": 8.655447006225586,
"learning_rate": 4.086138083119347e-05,
"loss": 3.7503,
"step": 740
},
{
"epoch": 0.3,
"grad_norm": 12.488017082214355,
"learning_rate": 4.061229544986095e-05,
"loss": 3.6059,
"step": 750
},
{
"epoch": 0.304,
"grad_norm": 10.392841339111328,
"learning_rate": 4.036064305364162e-05,
"loss": 3.7607,
"step": 760
},
{
"epoch": 0.308,
"grad_norm": 13.796865463256836,
"learning_rate": 4.010646501988769e-05,
"loss": 3.3188,
"step": 770
},
{
"epoch": 0.312,
"grad_norm": 6.374794006347656,
"learning_rate": 3.9849803141223324e-05,
"loss": 3.3962,
"step": 780
},
{
"epoch": 0.316,
"grad_norm": 9.044532775878906,
"learning_rate": 3.9590699618673086e-05,
"loss": 3.9154,
"step": 790
},
{
"epoch": 0.32,
"grad_norm": 8.740546226501465,
"learning_rate": 3.932919705472306e-05,
"loss": 3.4457,
"step": 800
},
{
"epoch": 0.324,
"grad_norm": 10.944662094116211,
"learning_rate": 3.906533844631604e-05,
"loss": 3.6514,
"step": 810
},
{
"epoch": 0.328,
"grad_norm": 8.953042984008789,
"learning_rate": 3.879916717778191e-05,
"loss": 3.705,
"step": 820
},
{
"epoch": 0.332,
"grad_norm": 10.540362358093262,
"learning_rate": 3.8530727013704215e-05,
"loss": 3.4666,
"step": 830
},
{
"epoch": 0.336,
"grad_norm": 8.946858406066895,
"learning_rate": 3.826006209172433e-05,
"loss": 3.8688,
"step": 840
},
{
"epoch": 0.34,
"grad_norm": 8.028446197509766,
"learning_rate": 3.7987216915284184e-05,
"loss": 3.6068,
"step": 850
},
{
"epoch": 0.344,
"grad_norm": 13.014655113220215,
"learning_rate": 3.771223634630892e-05,
"loss": 3.883,
"step": 860
},
{
"epoch": 0.348,
"grad_norm": 7.317591667175293,
"learning_rate": 3.743516559783055e-05,
"loss": 3.8452,
"step": 870
},
{
"epoch": 0.352,
"grad_norm": 9.003655433654785,
"learning_rate": 3.7156050226553956e-05,
"loss": 3.5083,
"step": 880
},
{
"epoch": 0.356,
"grad_norm": 8.790939331054688,
"learning_rate": 3.687493612536628e-05,
"loss": 3.6303,
"step": 890
},
{
"epoch": 0.36,
"grad_norm": 9.35024642944336,
"learning_rate": 3.659186951579111e-05,
"loss": 3.3183,
"step": 900
},
{
"epoch": 0.364,
"grad_norm": 12.418292045593262,
"learning_rate": 3.630689694038866e-05,
"loss": 3.6162,
"step": 910
},
{
"epoch": 0.368,
"grad_norm": 9.97085952758789,
"learning_rate": 3.6020065255103056e-05,
"loss": 3.6587,
"step": 920
},
{
"epoch": 0.372,
"grad_norm": 11.682862281799316,
"learning_rate": 3.573142162155819e-05,
"loss": 3.679,
"step": 930
},
{
"epoch": 0.376,
"grad_norm": 10.91349983215332,
"learning_rate": 3.544101349930328e-05,
"loss": 3.5703,
"step": 940
},
{
"epoch": 0.38,
"grad_norm": 7.593992710113525,
"learning_rate": 3.514888863800944e-05,
"loss": 3.0866,
"step": 950
},
{
"epoch": 0.384,
"grad_norm": 7.078611850738525,
"learning_rate": 3.485509506961856e-05,
"loss": 3.5236,
"step": 960
},
{
"epoch": 0.388,
"grad_norm": 4.66752290725708,
"learning_rate": 3.4559681100445756e-05,
"loss": 3.0979,
"step": 970
},
{
"epoch": 0.392,
"grad_norm": 11.089188575744629,
"learning_rate": 3.4262695303236724e-05,
"loss": 3.5252,
"step": 980
},
{
"epoch": 0.396,
"grad_norm": 9.009184837341309,
"learning_rate": 3.396418650918127e-05,
"loss": 3.7062,
"step": 990
},
{
"epoch": 0.4,
"grad_norm": 7.165460109710693,
"learning_rate": 3.366420379988441e-05,
"loss": 3.4182,
"step": 1000
},
{
"epoch": 0.404,
"grad_norm": 13.719085693359375,
"learning_rate": 3.336279649929614e-05,
"loss": 3.6603,
"step": 1010
},
{
"epoch": 0.408,
"grad_norm": 10.594961166381836,
"learning_rate": 3.306001416560156e-05,
"loss": 3.824,
"step": 1020
},
{
"epoch": 0.412,
"grad_norm": 9.565075874328613,
"learning_rate": 3.275590658307234e-05,
"loss": 3.074,
"step": 1030
},
{
"epoch": 0.416,
"grad_norm": 11.031000137329102,
"learning_rate": 3.245052375388107e-05,
"loss": 3.3561,
"step": 1040
},
{
"epoch": 0.42,
"grad_norm": 8.683501243591309,
"learning_rate": 3.214391588987976e-05,
"loss": 3.4976,
"step": 1050
},
{
"epoch": 0.424,
"grad_norm": 7.569673538208008,
"learning_rate": 3.1836133404343885e-05,
"loss": 3.3982,
"step": 1060
},
{
"epoch": 0.428,
"grad_norm": 9.724939346313477,
"learning_rate": 3.1527226903683286e-05,
"loss": 3.1605,
"step": 1070
},
{
"epoch": 0.432,
"grad_norm": 11.795547485351562,
"learning_rate": 3.121724717912138e-05,
"loss": 3.4858,
"step": 1080
},
{
"epoch": 0.436,
"grad_norm": 10.01028823852539,
"learning_rate": 3.090624519834383e-05,
"loss": 3.5917,
"step": 1090
},
{
"epoch": 0.44,
"grad_norm": 10.159195899963379,
"learning_rate": 3.0594272097118436e-05,
"loss": 3.5127,
"step": 1100
},
{
"epoch": 0.444,
"grad_norm": 12.02109432220459,
"learning_rate": 3.028137917088716e-05,
"loss": 3.7095,
"step": 1110
},
{
"epoch": 0.448,
"grad_norm": 11.922860145568848,
"learning_rate": 2.9967617866331997e-05,
"loss": 3.0155,
"step": 1120
},
{
"epoch": 0.452,
"grad_norm": 7.406614780426025,
"learning_rate": 2.9653039772916052e-05,
"loss": 3.5601,
"step": 1130
},
{
"epoch": 0.456,
"grad_norm": 9.041807174682617,
"learning_rate": 2.9337696614400977e-05,
"loss": 3.4362,
"step": 1140
},
{
"epoch": 0.46,
"grad_norm": 7.662649631500244,
"learning_rate": 2.902164024034246e-05,
"loss": 3.2583,
"step": 1150
},
{
"epoch": 0.464,
"grad_norm": 10.045381546020508,
"learning_rate": 2.8704922617564983e-05,
"loss": 3.5937,
"step": 1160
},
{
"epoch": 0.468,
"grad_norm": 10.429932594299316,
"learning_rate": 2.8387595821617275e-05,
"loss": 3.5703,
"step": 1170
},
{
"epoch": 0.472,
"grad_norm": 13.951080322265625,
"learning_rate": 2.8069712028209927e-05,
"loss": 3.2037,
"step": 1180
},
{
"epoch": 0.476,
"grad_norm": 9.768102645874023,
"learning_rate": 2.7751323504636544e-05,
"loss": 3.2948,
"step": 1190
},
{
"epoch": 0.48,
"grad_norm": 16.445524215698242,
"learning_rate": 2.7432482601179794e-05,
"loss": 3.7049,
"step": 1200
},
{
"epoch": 0.484,
"grad_norm": 10.077542304992676,
"learning_rate": 2.711324174250382e-05,
"loss": 3.7272,
"step": 1210
},
{
"epoch": 0.488,
"grad_norm": 10.981230735778809,
"learning_rate": 2.6793653419034482e-05,
"loss": 3.3686,
"step": 1220
},
{
"epoch": 0.492,
"grad_norm": 8.846978187561035,
"learning_rate": 2.6473770178328715e-05,
"loss": 3.7523,
"step": 1230
},
{
"epoch": 0.496,
"grad_norm": 13.945764541625977,
"learning_rate": 2.6153644616434526e-05,
"loss": 3.5152,
"step": 1240
},
{
"epoch": 0.5,
"grad_norm": 10.375041961669922,
"learning_rate": 2.583332936924299e-05,
"loss": 3.4198,
"step": 1250
},
{
"epoch": 0.504,
"grad_norm": 7.862137794494629,
"learning_rate": 2.5512877103833783e-05,
"loss": 3.4253,
"step": 1260
},
{
"epoch": 0.508,
"grad_norm": 9.651905059814453,
"learning_rate": 2.519234050981543e-05,
"loss": 2.9916,
"step": 1270
},
{
"epoch": 0.512,
"grad_norm": 8.323561668395996,
"learning_rate": 2.4871772290662044e-05,
"loss": 3.0336,
"step": 1280
},
{
"epoch": 0.516,
"grad_norm": 7.276916980743408,
"learning_rate": 2.4551225155047573e-05,
"loss": 3.3251,
"step": 1290
},
{
"epoch": 0.52,
"grad_norm": 9.36464786529541,
"learning_rate": 2.423075180817938e-05,
"loss": 3.0858,
"step": 1300
},
{
"epoch": 0.524,
"grad_norm": 9.869660377502441,
"learning_rate": 2.391040494313229e-05,
"loss": 3.2847,
"step": 1310
},
{
"epoch": 0.528,
"grad_norm": 8.658061981201172,
"learning_rate": 2.3590237232184644e-05,
"loss": 3.1331,
"step": 1320
},
{
"epoch": 0.532,
"grad_norm": 8.946754455566406,
"learning_rate": 2.3270301318157792e-05,
"loss": 3.4923,
"step": 1330
},
{
"epoch": 0.536,
"grad_norm": 10.488960266113281,
"learning_rate": 2.2950649805760438e-05,
"loss": 3.2958,
"step": 1340
},
{
"epoch": 0.54,
"grad_norm": 12.32264518737793,
"learning_rate": 2.263133525293918e-05,
"loss": 2.9298,
"step": 1350
},
{
"epoch": 0.544,
"grad_norm": 14.110706329345703,
"learning_rate": 2.2312410162236883e-05,
"loss": 3.2753,
"step": 1360
},
{
"epoch": 0.548,
"grad_norm": 11.187686920166016,
"learning_rate": 2.1993926972159972e-05,
"loss": 3.4152,
"step": 1370
},
{
"epoch": 0.552,
"grad_norm": 10.895075798034668,
"learning_rate": 2.1675938048556446e-05,
"loss": 3.4019,
"step": 1380
},
{
"epoch": 0.556,
"grad_norm": 5.504537105560303,
"learning_rate": 2.1358495676005664e-05,
"loss": 3.167,
"step": 1390
},
{
"epoch": 0.56,
"grad_norm": 8.452468872070312,
"learning_rate": 2.1041652049221648e-05,
"loss": 3.0729,
"step": 1400
},
{
"epoch": 0.564,
"grad_norm": 11.04509449005127,
"learning_rate": 2.0725459264471047e-05,
"loss": 3.642,
"step": 1410
},
{
"epoch": 0.568,
"grad_norm": 8.009263038635254,
"learning_rate": 2.0409969311007335e-05,
"loss": 3.1349,
"step": 1420
},
{
"epoch": 0.572,
"grad_norm": 8.250015258789062,
"learning_rate": 2.009523406252263e-05,
"loss": 3.4037,
"step": 1430
},
{
"epoch": 0.576,
"grad_norm": 6.933814525604248,
"learning_rate": 1.9781305268618417e-05,
"loss": 3.2761,
"step": 1440
},
{
"epoch": 0.58,
"grad_norm": 8.798672676086426,
"learning_rate": 1.9468234546296844e-05,
"loss": 3.2963,
"step": 1450
},
{
"epoch": 0.584,
"grad_norm": 8.615999221801758,
"learning_rate": 1.9156073371473618e-05,
"loss": 3.3487,
"step": 1460
},
{
"epoch": 0.588,
"grad_norm": 6.798926830291748,
"learning_rate": 1.8844873070514272e-05,
"loss": 3.2746,
"step": 1470
},
{
"epoch": 0.592,
"grad_norm": 8.364091873168945,
"learning_rate": 1.8534684811794893e-05,
"loss": 3.071,
"step": 1480
},
{
"epoch": 0.596,
"grad_norm": 6.177745342254639,
"learning_rate": 1.822555959728892e-05,
"loss": 2.8733,
"step": 1490
},
{
"epoch": 0.6,
"grad_norm": 7.9252238273620605,
"learning_rate": 1.7917548254181273e-05,
"loss": 3.0836,
"step": 1500
},
{
"epoch": 0.604,
"grad_norm": 10.669748306274414,
"learning_rate": 1.7610701426511128e-05,
"loss": 3.587,
"step": 1510
},
{
"epoch": 0.608,
"grad_norm": 6.509505271911621,
"learning_rate": 1.7305069566845046e-05,
"loss": 2.8579,
"step": 1520
},
{
"epoch": 0.612,
"grad_norm": 8.801206588745117,
"learning_rate": 1.7000702927981254e-05,
"loss": 3.5055,
"step": 1530
},
{
"epoch": 0.616,
"grad_norm": 13.360625267028809,
"learning_rate": 1.669765155468708e-05,
"loss": 3.007,
"step": 1540
},
{
"epoch": 0.62,
"grad_norm": 9.038350105285645,
"learning_rate": 1.6395965275470393e-05,
"loss": 3.546,
"step": 1550
},
{
"epoch": 0.624,
"grad_norm": 12.468111038208008,
"learning_rate": 1.6095693694386697e-05,
"loss": 3.046,
"step": 1560
},
{
"epoch": 0.628,
"grad_norm": 9.378480911254883,
"learning_rate": 1.5796886182883053e-05,
"loss": 2.9804,
"step": 1570
},
{
"epoch": 0.632,
"grad_norm": 8.186980247497559,
"learning_rate": 1.549959187168038e-05,
"loss": 3.1672,
"step": 1580
},
{
"epoch": 0.636,
"grad_norm": 13.096222877502441,
"learning_rate": 1.520385964269519e-05,
"loss": 3.0177,
"step": 1590
},
{
"epoch": 0.64,
"grad_norm": 9.109463691711426,
"learning_rate": 1.4909738121002276e-05,
"loss": 3.101,
"step": 1600
},
{
"epoch": 0.644,
"grad_norm": 8.423794746398926,
"learning_rate": 1.4617275666839725e-05,
"loss": 2.8508,
"step": 1610
},
{
"epoch": 0.648,
"grad_norm": 6.3356122970581055,
"learning_rate": 1.4326520367657314e-05,
"loss": 3.3239,
"step": 1620
},
{
"epoch": 0.652,
"grad_norm": 8.81240177154541,
"learning_rate": 1.4037520030209934e-05,
"loss": 3.2261,
"step": 1630
},
{
"epoch": 0.656,
"grad_norm": 7.250948905944824,
"learning_rate": 1.3750322172696972e-05,
"loss": 3.1138,
"step": 1640
},
{
"epoch": 0.66,
"grad_norm": 9.962249755859375,
"learning_rate": 1.3464974016949342e-05,
"loss": 3.2969,
"step": 1650
},
{
"epoch": 0.664,
"grad_norm": 7.954286575317383,
"learning_rate": 1.3181522480665098e-05,
"loss": 2.7313,
"step": 1660
},
{
"epoch": 0.668,
"grad_norm": 11.184345245361328,
"learning_rate": 1.2900014169695082e-05,
"loss": 3.2666,
"step": 1670
},
{
"epoch": 0.672,
"grad_norm": 8.145426750183105,
"learning_rate": 1.262049537037992e-05,
"loss": 2.7759,
"step": 1680
},
{
"epoch": 0.676,
"grad_norm": 12.047683715820312,
"learning_rate": 1.2343012041939469e-05,
"loss": 2.9462,
"step": 1690
},
{
"epoch": 0.68,
"grad_norm": 11.436731338500977,
"learning_rate": 1.2067609808916086e-05,
"loss": 3.362,
"step": 1700
},
{
"epoch": 0.684,
"grad_norm": 7.729074954986572,
"learning_rate": 1.1794333953672893e-05,
"loss": 3.4444,
"step": 1710
},
{
"epoch": 0.688,
"grad_norm": 11.743609428405762,
"learning_rate": 1.1523229408948394e-05,
"loss": 2.9723,
"step": 1720
},
{
"epoch": 0.692,
"grad_norm": 7.753131866455078,
"learning_rate": 1.1254340750468445e-05,
"loss": 3.0701,
"step": 1730
},
{
"epoch": 0.696,
"grad_norm": 10.646190643310547,
"learning_rate": 1.0987712189617049e-05,
"loss": 3.3374,
"step": 1740
},
{
"epoch": 0.7,
"grad_norm": 8.74120044708252,
"learning_rate": 1.0723387566166979e-05,
"loss": 3.0917,
"step": 1750
},
{
"epoch": 0.704,
"grad_norm": 9.45445728302002,
"learning_rate": 1.0461410341071528e-05,
"loss": 3.2809,
"step": 1760
},
{
"epoch": 0.708,
"grad_norm": 11.984269142150879,
"learning_rate": 1.0201823589318554e-05,
"loss": 3.256,
"step": 1770
},
{
"epoch": 0.712,
"grad_norm": 6.780118465423584,
"learning_rate": 9.944669992847946e-06,
"loss": 3.0955,
"step": 1780
},
{
"epoch": 0.716,
"grad_norm": 10.487933158874512,
"learning_rate": 9.689991833533804e-06,
"loss": 3.1214,
"step": 1790
},
{
"epoch": 0.72,
"grad_norm": 7.70168399810791,
"learning_rate": 9.437830986232265e-06,
"loss": 3.052,
"step": 1800
},
{
"epoch": 0.724,
"grad_norm": 13.816009521484375,
"learning_rate": 9.188228911896412e-06,
"loss": 3.4094,
"step": 1810
},
{
"epoch": 0.728,
"grad_norm": 8.344259262084961,
"learning_rate": 8.94122665075909e-06,
"loss": 3.0472,
"step": 1820
},
{
"epoch": 0.732,
"grad_norm": 19.413257598876953,
"learning_rate": 8.696864815584995e-06,
"loss": 2.6052,
"step": 1830
},
{
"epoch": 0.736,
"grad_norm": 10.31498908996582,
"learning_rate": 8.455183584993009e-06,
"loss": 3.0981,
"step": 1840
},
{
"epoch": 0.74,
"grad_norm": 11.46462345123291,
"learning_rate": 8.2162226968499e-06,
"loss": 3.1952,
"step": 1850
},
{
"epoch": 0.744,
"grad_norm": 9.817370414733887,
"learning_rate": 7.980021441736576e-06,
"loss": 2.9148,
"step": 1860
},
{
"epoch": 0.748,
"grad_norm": 12.085224151611328,
"learning_rate": 7.746618656487748e-06,
"loss": 3.1418,
"step": 1870
},
{
"epoch": 0.752,
"grad_norm": 13.42601490020752,
"learning_rate": 7.516052717806346e-06,
"loss": 3.0495,
"step": 1880
},
{
"epoch": 0.756,
"grad_norm": 10.328361511230469,
"learning_rate": 7.288361535953472e-06,
"loss": 3.2537,
"step": 1890
},
{
"epoch": 0.76,
"grad_norm": 8.320837020874023,
"learning_rate": 7.06358254851513e-06,
"loss": 3.2002,
"step": 1900
},
{
"epoch": 0.764,
"grad_norm": 12.367525100708008,
"learning_rate": 6.841752714246588e-06,
"loss": 3.415,
"step": 1910
},
{
"epoch": 0.768,
"grad_norm": 8.72415828704834,
"learning_rate": 6.622908506995581e-06,
"loss": 2.7481,
"step": 1920
},
{
"epoch": 0.772,
"grad_norm": 9.888436317443848,
"learning_rate": 6.407085909705157e-06,
"loss": 3.4815,
"step": 1930
},
{
"epoch": 0.776,
"grad_norm": 7.541075706481934,
"learning_rate": 6.194320408497245e-06,
"loss": 3.4048,
"step": 1940
},
{
"epoch": 0.78,
"grad_norm": 11.171248435974121,
"learning_rate": 5.98464698683798e-06,
"loss": 3.5409,
"step": 1950
},
{
"epoch": 0.784,
"grad_norm": 9.28205394744873,
"learning_rate": 5.778100119785587e-06,
"loss": 3.1082,
"step": 1960
},
{
"epoch": 0.788,
"grad_norm": 8.433388710021973,
"learning_rate": 5.5747137683219404e-06,
"loss": 2.9565,
"step": 1970
},
{
"epoch": 0.792,
"grad_norm": 14.938470840454102,
"learning_rate": 5.374521373768549e-06,
"loss": 3.2282,
"step": 1980
},
{
"epoch": 0.796,
"grad_norm": 9.903738975524902,
"learning_rate": 5.177555852288119e-06,
"loss": 2.9652,
"step": 1990
},
{
"epoch": 0.8,
"grad_norm": 13.002461433410645,
"learning_rate": 4.983849589472348e-06,
"loss": 3.221,
"step": 2000
},
{
"epoch": 0.804,
"grad_norm": 12.107378005981445,
"learning_rate": 4.793434435016986e-06,
"loss": 3.1341,
"step": 2010
},
{
"epoch": 0.808,
"grad_norm": 11.94257640838623,
"learning_rate": 4.606341697485087e-06,
"loss": 3.318,
"step": 2020
},
{
"epoch": 0.812,
"grad_norm": 10.116772651672363,
"learning_rate": 4.422602139159091e-06,
"loss": 3.2286,
"step": 2030
},
{
"epoch": 0.816,
"grad_norm": 10.068933486938477,
"learning_rate": 4.242245970982883e-06,
"loss": 3.306,
"step": 2040
},
{
"epoch": 0.82,
"grad_norm": 10.280326843261719,
"learning_rate": 4.065302847594369e-06,
"loss": 3.005,
"step": 2050
},
{
"epoch": 0.824,
"grad_norm": 10.214073181152344,
"learning_rate": 3.891801862449629e-06,
"loss": 2.9953,
"step": 2060
},
{
"epoch": 0.828,
"grad_norm": 12.787151336669922,
"learning_rate": 3.721771543039254e-06,
"loss": 2.9877,
"step": 2070
},
{
"epoch": 0.832,
"grad_norm": 7.119079113006592,
"learning_rate": 3.5552398461978277e-06,
"loss": 3.0851,
"step": 2080
},
{
"epoch": 0.836,
"grad_norm": 6.1061177253723145,
"learning_rate": 3.3922341535071483e-06,
"loss": 2.9198,
"step": 2090
},
{
"epoch": 0.84,
"grad_norm": 9.866963386535645,
"learning_rate": 3.23278126679408e-06,
"loss": 2.9846,
"step": 2100
},
{
"epoch": 0.844,
"grad_norm": 9.084943771362305,
"learning_rate": 3.0769074037237583e-06,
"loss": 2.9903,
"step": 2110
},
{
"epoch": 0.848,
"grad_norm": 6.5540595054626465,
"learning_rate": 2.9246381934887684e-06,
"loss": 3.2851,
"step": 2120
},
{
"epoch": 0.852,
"grad_norm": 7.740701675415039,
"learning_rate": 2.7759986725951703e-06,
"loss": 2.9797,
"step": 2130
},
{
"epoch": 0.856,
"grad_norm": 10.074856758117676,
"learning_rate": 2.6310132807458894e-06,
"loss": 3.1325,
"step": 2140
},
{
"epoch": 0.86,
"grad_norm": 10.44127368927002,
"learning_rate": 2.4897058568223137e-06,
"loss": 3.0159,
"step": 2150
},
{
"epoch": 0.864,
"grad_norm": 9.894632339477539,
"learning_rate": 2.3520996349645995e-06,
"loss": 2.8015,
"step": 2160
},
{
"epoch": 0.868,
"grad_norm": 9.043245315551758,
"learning_rate": 2.218217240751491e-06,
"loss": 3.4477,
"step": 2170
},
{
"epoch": 0.872,
"grad_norm": 9.901315689086914,
"learning_rate": 2.088080687480151e-06,
"loss": 3.3157,
"step": 2180
},
{
"epoch": 0.876,
"grad_norm": 8.202696800231934,
"learning_rate": 1.961711372546657e-06,
"loss": 2.9467,
"step": 2190
},
{
"epoch": 0.88,
"grad_norm": 8.691917419433594,
"learning_rate": 1.8391300739278139e-06,
"loss": 2.9079,
"step": 2200
},
{
"epoch": 0.884,
"grad_norm": 13.363630294799805,
"learning_rate": 1.7203569467647674e-06,
"loss": 3.2583,
"step": 2210
},
{
"epoch": 0.888,
"grad_norm": 11.732659339904785,
"learning_rate": 1.6054115200490493e-06,
"loss": 3.0431,
"step": 2220
},
{
"epoch": 0.892,
"grad_norm": 7.8193230628967285,
"learning_rate": 1.4943126934115536e-06,
"loss": 3.1155,
"step": 2230
},
{
"epoch": 0.896,
"grad_norm": 6.232199192047119,
"learning_rate": 1.3870787340150376e-06,
"loss": 3.2006,
"step": 2240
},
{
"epoch": 0.9,
"grad_norm": 5.650846004486084,
"learning_rate": 1.2837272735505668e-06,
"loss": 2.8882,
"step": 2250
},
{
"epoch": 0.904,
"grad_norm": 7.191598892211914,
"learning_rate": 1.1842753053384559e-06,
"loss": 3.0833,
"step": 2260
},
{
"epoch": 0.908,
"grad_norm": 8.854833602905273,
"learning_rate": 1.0887391815342124e-06,
"loss": 3.3196,
"step": 2270
},
{
"epoch": 0.912,
"grad_norm": 13.160386085510254,
"learning_rate": 9.971346104398455e-07,
"loss": 3.564,
"step": 2280
},
{
"epoch": 0.916,
"grad_norm": 8.540671348571777,
"learning_rate": 9.09476653921082e-07,
"loss": 3.1383,
"step": 2290
},
{
"epoch": 0.92,
"grad_norm": 12.331473350524902,
"learning_rate": 8.257797249308419e-07,
"loss": 3.259,
"step": 2300
},
{
"epoch": 0.924,
"grad_norm": 7.576813697814941,
"learning_rate": 7.460575851394341e-07,
"loss": 2.8659,
"step": 2310
},
{
"epoch": 0.928,
"grad_norm": 6.937955379486084,
"learning_rate": 6.703233426718136e-07,
"loss": 2.9416,
"step": 2320
},
{
"epoch": 0.932,
"grad_norm": 7.9867777824401855,
"learning_rate": 5.985894499523193e-07,
"loss": 3.0008,
"step": 2330
},
{
"epoch": 0.936,
"grad_norm": 10.474209785461426,
"learning_rate": 5.308677016572145e-07,
"loss": 3.6042,
"step": 2340
},
{
"epoch": 0.94,
"grad_norm": 6.954331398010254,
"learning_rate": 4.6716923277536627e-07,
"loss": 2.696,
"step": 2350
},
{
"epoch": 0.944,
"grad_norm": 6.555063247680664,
"learning_rate": 4.075045167774072e-07,
"loss": 3.2311,
"step": 2360
},
{
"epoch": 0.948,
"grad_norm": 7.122920513153076,
"learning_rate": 3.518833638936514e-07,
"loss": 3.1349,
"step": 2370
},
{
"epoch": 0.952,
"grad_norm": 10.269899368286133,
"learning_rate": 3.003149195010907e-07,
"loss": 2.9381,
"step": 2380
},
{
"epoch": 0.956,
"grad_norm": 8.958882331848145,
"learning_rate": 2.528076626196585e-07,
"loss": 3.0804,
"step": 2390
},
{
"epoch": 0.96,
"grad_norm": 11.036646842956543,
"learning_rate": 2.0936940451811437e-07,
"loss": 3.0191,
"step": 2400
}
],
"logging_steps": 10,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.128334475132928e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}