{
"best_global_step": 5049,
"best_metric": 0.5286195286195287,
"best_model_checkpoint": "./wav2vec2_accent_classification_exp2/checkpoint-5049",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 8415,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011883541295306001,
"grad_norm": 13.946310997009277,
"learning_rate": 1.0688836104513065e-06,
"loss": 1.3423,
"step": 20
},
{
"epoch": 0.023767082590612002,
"grad_norm": 10.987089157104492,
"learning_rate": 2.2565320665083133e-06,
"loss": 1.5915,
"step": 40
},
{
"epoch": 0.035650623885918005,
"grad_norm": 9.114075660705566,
"learning_rate": 3.4441805225653207e-06,
"loss": 1.2294,
"step": 60
},
{
"epoch": 0.047534165181224004,
"grad_norm": 4.089534282684326,
"learning_rate": 4.631828978622328e-06,
"loss": 1.6258,
"step": 80
},
{
"epoch": 0.059417706476530004,
"grad_norm": 1.886678695678711,
"learning_rate": 5.819477434679335e-06,
"loss": 1.3137,
"step": 100
},
{
"epoch": 0.07130124777183601,
"grad_norm": 4.820520401000977,
"learning_rate": 6.947743467933492e-06,
"loss": 1.441,
"step": 120
},
{
"epoch": 0.08318478906714201,
"grad_norm": 8.273014068603516,
"learning_rate": 8.135391923990499e-06,
"loss": 1.0149,
"step": 140
},
{
"epoch": 0.09506833036244801,
"grad_norm": 14.566327095031738,
"learning_rate": 9.263657957244656e-06,
"loss": 1.9487,
"step": 160
},
{
"epoch": 0.10695187165775401,
"grad_norm": 8.97884750366211,
"learning_rate": 1.0451306413301664e-05,
"loss": 1.6529,
"step": 180
},
{
"epoch": 0.11883541295306001,
"grad_norm": 9.661385536193848,
"learning_rate": 1.163895486935867e-05,
"loss": 1.5005,
"step": 200
},
{
"epoch": 0.13071895424836602,
"grad_norm": 25.764766693115234,
"learning_rate": 1.2826603325415679e-05,
"loss": 1.1438,
"step": 220
},
{
"epoch": 0.14260249554367202,
"grad_norm": 6.792453289031982,
"learning_rate": 1.4014251781472683e-05,
"loss": 1.391,
"step": 240
},
{
"epoch": 0.15448603683897802,
"grad_norm": 20.50897979736328,
"learning_rate": 1.5201900237529693e-05,
"loss": 1.5084,
"step": 260
},
{
"epoch": 0.16636957813428402,
"grad_norm": 16.218360900878906,
"learning_rate": 1.63895486935867e-05,
"loss": 0.9483,
"step": 280
},
{
"epoch": 0.17825311942959002,
"grad_norm": 42.791805267333984,
"learning_rate": 1.7577197149643705e-05,
"loss": 1.4542,
"step": 300
},
{
"epoch": 0.19013666072489602,
"grad_norm": 8.093267440795898,
"learning_rate": 1.876484560570071e-05,
"loss": 0.9734,
"step": 320
},
{
"epoch": 0.20202020202020202,
"grad_norm": 11.891115188598633,
"learning_rate": 1.995249406175772e-05,
"loss": 2.0537,
"step": 340
},
{
"epoch": 0.21390374331550802,
"grad_norm": 8.027478218078613,
"learning_rate": 2.114014251781473e-05,
"loss": 1.4334,
"step": 360
},
{
"epoch": 0.22578728461081402,
"grad_norm": 0.1833440214395523,
"learning_rate": 2.2327790973871736e-05,
"loss": 1.261,
"step": 380
},
{
"epoch": 0.23767082590612001,
"grad_norm": 13.981164932250977,
"learning_rate": 2.3515439429928742e-05,
"loss": 1.1544,
"step": 400
},
{
"epoch": 0.24955436720142601,
"grad_norm": 10.806032180786133,
"learning_rate": 2.4703087885985748e-05,
"loss": 1.3147,
"step": 420
},
{
"epoch": 0.26143790849673204,
"grad_norm": 0.24756519496440887,
"learning_rate": 2.5890736342042754e-05,
"loss": 1.563,
"step": 440
},
{
"epoch": 0.27332144979203804,
"grad_norm": 5.813249588012695,
"learning_rate": 2.7078384798099763e-05,
"loss": 1.123,
"step": 460
},
{
"epoch": 0.28520499108734404,
"grad_norm": 0.6889862418174744,
"learning_rate": 2.826603325415677e-05,
"loss": 1.4189,
"step": 480
},
{
"epoch": 0.29708853238265004,
"grad_norm": 16.024629592895508,
"learning_rate": 2.9453681710213776e-05,
"loss": 1.6982,
"step": 500
},
{
"epoch": 0.30897207367795604,
"grad_norm": 14.917092323303223,
"learning_rate": 3.064133016627079e-05,
"loss": 1.6073,
"step": 520
},
{
"epoch": 0.32085561497326204,
"grad_norm": 3.446753978729248,
"learning_rate": 3.1828978622327794e-05,
"loss": 1.6064,
"step": 540
},
{
"epoch": 0.33273915626856804,
"grad_norm": 8.022488594055176,
"learning_rate": 3.30166270783848e-05,
"loss": 1.6411,
"step": 560
},
{
"epoch": 0.34462269756387404,
"grad_norm": 6.750226974487305,
"learning_rate": 3.4204275534441806e-05,
"loss": 1.6224,
"step": 580
},
{
"epoch": 0.35650623885918004,
"grad_norm": 8.452493667602539,
"learning_rate": 3.539192399049881e-05,
"loss": 1.906,
"step": 600
},
{
"epoch": 0.36838978015448604,
"grad_norm": 2.7862751483917236,
"learning_rate": 3.657957244655582e-05,
"loss": 1.2242,
"step": 620
},
{
"epoch": 0.38027332144979203,
"grad_norm": 9.983319282531738,
"learning_rate": 3.7767220902612825e-05,
"loss": 0.9727,
"step": 640
},
{
"epoch": 0.39215686274509803,
"grad_norm": 18.746906280517578,
"learning_rate": 3.895486935866984e-05,
"loss": 1.4356,
"step": 660
},
{
"epoch": 0.40404040404040403,
"grad_norm": 6.126219749450684,
"learning_rate": 4.0142517814726843e-05,
"loss": 1.354,
"step": 680
},
{
"epoch": 0.41592394533571003,
"grad_norm": 5.892016410827637,
"learning_rate": 4.133016627078385e-05,
"loss": 1.2593,
"step": 700
},
{
"epoch": 0.42780748663101603,
"grad_norm": 36.41590881347656,
"learning_rate": 4.2517814726840856e-05,
"loss": 1.6788,
"step": 720
},
{
"epoch": 0.43969102792632203,
"grad_norm": 7.916555881500244,
"learning_rate": 4.370546318289787e-05,
"loss": 1.2726,
"step": 740
},
{
"epoch": 0.45157456922162803,
"grad_norm": 13.21894359588623,
"learning_rate": 4.4893111638954874e-05,
"loss": 1.2754,
"step": 760
},
{
"epoch": 0.46345811051693403,
"grad_norm": 15.568081855773926,
"learning_rate": 4.6080760095011874e-05,
"loss": 1.4403,
"step": 780
},
{
"epoch": 0.47534165181224003,
"grad_norm": 0.22334127128124237,
"learning_rate": 4.7268408551068886e-05,
"loss": 1.1967,
"step": 800
},
{
"epoch": 0.48722519310754603,
"grad_norm": 3.894404888153076,
"learning_rate": 4.845605700712589e-05,
"loss": 1.3669,
"step": 820
},
{
"epoch": 0.49910873440285203,
"grad_norm": 0.42559102177619934,
"learning_rate": 4.96437054631829e-05,
"loss": 1.4207,
"step": 840
},
{
"epoch": 0.5109922756981581,
"grad_norm": 2.018228769302368,
"learning_rate": 4.9907566354152915e-05,
"loss": 1.5397,
"step": 860
},
{
"epoch": 0.5228758169934641,
"grad_norm": 4.865698337554932,
"learning_rate": 4.977551828865707e-05,
"loss": 0.9683,
"step": 880
},
{
"epoch": 0.5347593582887701,
"grad_norm": 13.285181999206543,
"learning_rate": 4.964347022316123e-05,
"loss": 1.0565,
"step": 900
},
{
"epoch": 0.5466428995840761,
"grad_norm": 18.873655319213867,
"learning_rate": 4.951142215766539e-05,
"loss": 1.2446,
"step": 920
},
{
"epoch": 0.5585264408793821,
"grad_norm": 7.773611068725586,
"learning_rate": 4.937937409216955e-05,
"loss": 1.1926,
"step": 940
},
{
"epoch": 0.5704099821746881,
"grad_norm": 32.8792610168457,
"learning_rate": 4.9247326026673714e-05,
"loss": 1.4732,
"step": 960
},
{
"epoch": 0.5822935234699941,
"grad_norm": 7.385775566101074,
"learning_rate": 4.911527796117787e-05,
"loss": 1.3785,
"step": 980
},
{
"epoch": 0.5941770647653001,
"grad_norm": 18.77931022644043,
"learning_rate": 4.898322989568203e-05,
"loss": 1.3192,
"step": 1000
},
{
"epoch": 0.6060606060606061,
"grad_norm": 5.315192699432373,
"learning_rate": 4.8851181830186186e-05,
"loss": 1.8817,
"step": 1020
},
{
"epoch": 0.6179441473559121,
"grad_norm": 55.71400833129883,
"learning_rate": 4.871913376469035e-05,
"loss": 1.2092,
"step": 1040
},
{
"epoch": 0.6298276886512181,
"grad_norm": 11.886483192443848,
"learning_rate": 4.8593688102469305e-05,
"loss": 1.4677,
"step": 1060
},
{
"epoch": 0.6417112299465241,
"grad_norm": 11.212895393371582,
"learning_rate": 4.846164003697346e-05,
"loss": 1.3166,
"step": 1080
},
{
"epoch": 0.6535947712418301,
"grad_norm": 0.1047590896487236,
"learning_rate": 4.832959197147762e-05,
"loss": 1.1183,
"step": 1100
},
{
"epoch": 0.6654783125371361,
"grad_norm": 15.537800788879395,
"learning_rate": 4.8197543905981776e-05,
"loss": 1.149,
"step": 1120
},
{
"epoch": 0.6773618538324421,
"grad_norm": 2.474100351333618,
"learning_rate": 4.806549584048594e-05,
"loss": 1.0457,
"step": 1140
},
{
"epoch": 0.6892453951277481,
"grad_norm": 8.193405151367188,
"learning_rate": 4.79334477749901e-05,
"loss": 1.6977,
"step": 1160
},
{
"epoch": 0.7011289364230541,
"grad_norm": 8.54953384399414,
"learning_rate": 4.780139970949426e-05,
"loss": 1.1706,
"step": 1180
},
{
"epoch": 0.7130124777183601,
"grad_norm": 1.4723117351531982,
"learning_rate": 4.766935164399842e-05,
"loss": 1.8051,
"step": 1200
},
{
"epoch": 0.7248960190136661,
"grad_norm": 14.194575309753418,
"learning_rate": 4.7537303578502576e-05,
"loss": 1.1304,
"step": 1220
},
{
"epoch": 0.7367795603089721,
"grad_norm": 2.2631449699401855,
"learning_rate": 4.740525551300673e-05,
"loss": 1.0901,
"step": 1240
},
{
"epoch": 0.7486631016042781,
"grad_norm": 15.501215934753418,
"learning_rate": 4.72732074475109e-05,
"loss": 1.5244,
"step": 1260
},
{
"epoch": 0.7605466428995841,
"grad_norm": 8.5521240234375,
"learning_rate": 4.7141159382015054e-05,
"loss": 1.036,
"step": 1280
},
{
"epoch": 0.7724301841948901,
"grad_norm": 2.5163466930389404,
"learning_rate": 4.700911131651922e-05,
"loss": 1.1431,
"step": 1300
},
{
"epoch": 0.7843137254901961,
"grad_norm": 6.166524887084961,
"learning_rate": 4.6877063251023375e-05,
"loss": 1.6139,
"step": 1320
},
{
"epoch": 0.7961972667855021,
"grad_norm": 0.9744242429733276,
"learning_rate": 4.674501518552753e-05,
"loss": 1.177,
"step": 1340
},
{
"epoch": 0.8080808080808081,
"grad_norm": 24.883136749267578,
"learning_rate": 4.661296712003169e-05,
"loss": 1.0183,
"step": 1360
},
{
"epoch": 0.8199643493761141,
"grad_norm": 60.34940719604492,
"learning_rate": 4.648091905453585e-05,
"loss": 1.5402,
"step": 1380
},
{
"epoch": 0.8318478906714201,
"grad_norm": 1.4948738813400269,
"learning_rate": 4.634887098904001e-05,
"loss": 1.8425,
"step": 1400
},
{
"epoch": 0.8437314319667261,
"grad_norm": 16.30664825439453,
"learning_rate": 4.6216822923544174e-05,
"loss": 1.8361,
"step": 1420
},
{
"epoch": 0.8556149732620321,
"grad_norm": 26.030075073242188,
"learning_rate": 4.608477485804833e-05,
"loss": 1.6207,
"step": 1440
},
{
"epoch": 0.8674985145573381,
"grad_norm": 9.511061668395996,
"learning_rate": 4.595272679255249e-05,
"loss": 1.2074,
"step": 1460
},
{
"epoch": 0.8793820558526441,
"grad_norm": 25.75135040283203,
"learning_rate": 4.5820678727056646e-05,
"loss": 1.3626,
"step": 1480
},
{
"epoch": 0.8912655971479501,
"grad_norm": 9.004514694213867,
"learning_rate": 4.568863066156081e-05,
"loss": 1.0094,
"step": 1500
},
{
"epoch": 0.9031491384432561,
"grad_norm": 15.008450508117676,
"learning_rate": 4.555658259606497e-05,
"loss": 0.9041,
"step": 1520
},
{
"epoch": 0.9150326797385621,
"grad_norm": 10.19078540802002,
"learning_rate": 4.542453453056913e-05,
"loss": 1.2298,
"step": 1540
},
{
"epoch": 0.9269162210338681,
"grad_norm": 6.368117809295654,
"learning_rate": 4.529248646507329e-05,
"loss": 1.1629,
"step": 1560
},
{
"epoch": 0.9387997623291741,
"grad_norm": 8.672981262207031,
"learning_rate": 4.5160438399577445e-05,
"loss": 1.4828,
"step": 1580
},
{
"epoch": 0.9506833036244801,
"grad_norm": 8.937565803527832,
"learning_rate": 4.502839033408161e-05,
"loss": 1.0707,
"step": 1600
},
{
"epoch": 0.9625668449197861,
"grad_norm": 42.59934997558594,
"learning_rate": 4.4896342268585766e-05,
"loss": 1.1838,
"step": 1620
},
{
"epoch": 0.9744503862150921,
"grad_norm": 5.572483062744141,
"learning_rate": 4.476429420308993e-05,
"loss": 1.3519,
"step": 1640
},
{
"epoch": 0.9863339275103981,
"grad_norm": 15.407792091369629,
"learning_rate": 4.463224613759409e-05,
"loss": 1.9737,
"step": 1660
},
{
"epoch": 0.9982174688057041,
"grad_norm": 10.246801376342773,
"learning_rate": 4.450019807209825e-05,
"loss": 1.3942,
"step": 1680
},
{
"epoch": 1.0,
"eval_accuracy": 0.5084175084175084,
"eval_loss": 1.4993430376052856,
"eval_runtime": 95.7254,
"eval_samples_per_second": 3.103,
"eval_steps_per_second": 3.103,
"step": 1683
},
{
"epoch": 1.0101010101010102,
"grad_norm": 92.40440368652344,
"learning_rate": 4.436815000660241e-05,
"loss": 1.5887,
"step": 1700
},
{
"epoch": 1.0219845513963162,
"grad_norm": 18.110252380371094,
"learning_rate": 4.4236101941106565e-05,
"loss": 1.8026,
"step": 1720
},
{
"epoch": 1.0338680926916222,
"grad_norm": 14.221270561218262,
"learning_rate": 4.410405387561072e-05,
"loss": 1.3638,
"step": 1740
},
{
"epoch": 1.0457516339869282,
"grad_norm": 4.216358661651611,
"learning_rate": 4.3972005810114886e-05,
"loss": 1.8443,
"step": 1760
},
{
"epoch": 1.0576351752822342,
"grad_norm": 0.6796126365661621,
"learning_rate": 4.3839957744619043e-05,
"loss": 0.7499,
"step": 1780
},
{
"epoch": 1.0695187165775402,
"grad_norm": 6.125434398651123,
"learning_rate": 4.370790967912321e-05,
"loss": 1.197,
"step": 1800
},
{
"epoch": 1.0814022578728462,
"grad_norm": 1.4550594091415405,
"learning_rate": 4.3575861613627364e-05,
"loss": 1.7175,
"step": 1820
},
{
"epoch": 1.0932857991681522,
"grad_norm": 9.569698333740234,
"learning_rate": 4.344381354813152e-05,
"loss": 1.5715,
"step": 1840
},
{
"epoch": 1.1051693404634582,
"grad_norm": 6.054975509643555,
"learning_rate": 4.331176548263568e-05,
"loss": 1.5164,
"step": 1860
},
{
"epoch": 1.1170528817587642,
"grad_norm": 66.08502197265625,
"learning_rate": 4.317971741713984e-05,
"loss": 1.834,
"step": 1880
},
{
"epoch": 1.1289364230540702,
"grad_norm": 31.93483543395996,
"learning_rate": 4.3047669351644e-05,
"loss": 1.4792,
"step": 1900
},
{
"epoch": 1.1408199643493762,
"grad_norm": 8.787007331848145,
"learning_rate": 4.2915621286148164e-05,
"loss": 1.1916,
"step": 1920
},
{
"epoch": 1.1527035056446822,
"grad_norm": 12.01029109954834,
"learning_rate": 4.278357322065232e-05,
"loss": 1.4424,
"step": 1940
},
{
"epoch": 1.1645870469399882,
"grad_norm": 10.144413948059082,
"learning_rate": 4.265152515515648e-05,
"loss": 1.1927,
"step": 1960
},
{
"epoch": 1.1764705882352942,
"grad_norm": 8.053743362426758,
"learning_rate": 4.2519477089660635e-05,
"loss": 1.4009,
"step": 1980
},
{
"epoch": 1.1883541295306002,
"grad_norm": 3.4341797828674316,
"learning_rate": 4.23874290241648e-05,
"loss": 0.8713,
"step": 2000
},
{
"epoch": 1.2002376708259062,
"grad_norm": 8.886828422546387,
"learning_rate": 4.2255380958668956e-05,
"loss": 0.9202,
"step": 2020
},
{
"epoch": 1.2121212121212122,
"grad_norm": 24.93857192993164,
"learning_rate": 4.212333289317312e-05,
"loss": 1.7785,
"step": 2040
},
{
"epoch": 1.2240047534165182,
"grad_norm": 16.332698822021484,
"learning_rate": 4.199128482767728e-05,
"loss": 1.1573,
"step": 2060
},
{
"epoch": 1.2358882947118242,
"grad_norm": 19.60898208618164,
"learning_rate": 4.1859236762181434e-05,
"loss": 1.4027,
"step": 2080
},
{
"epoch": 1.2477718360071302,
"grad_norm": 0.7732555866241455,
"learning_rate": 4.172718869668559e-05,
"loss": 1.1273,
"step": 2100
},
{
"epoch": 1.2596553773024362,
"grad_norm": 10.469820022583008,
"learning_rate": 4.1595140631189755e-05,
"loss": 1.1299,
"step": 2120
},
{
"epoch": 1.2715389185977422,
"grad_norm": 33.61751174926758,
"learning_rate": 4.146309256569392e-05,
"loss": 1.6233,
"step": 2140
},
{
"epoch": 1.2834224598930482,
"grad_norm": 18.64516258239746,
"learning_rate": 4.1331044500198077e-05,
"loss": 1.222,
"step": 2160
},
{
"epoch": 1.2953060011883542,
"grad_norm": 31.447341918945312,
"learning_rate": 4.1198996434702234e-05,
"loss": 1.4424,
"step": 2180
},
{
"epoch": 1.3071895424836601,
"grad_norm": 10.651689529418945,
"learning_rate": 4.106694836920639e-05,
"loss": 1.5345,
"step": 2200
},
{
"epoch": 1.3190730837789661,
"grad_norm": 3.412811040878296,
"learning_rate": 4.0934900303710555e-05,
"loss": 0.9922,
"step": 2220
},
{
"epoch": 1.3309566250742721,
"grad_norm": 14.007122039794922,
"learning_rate": 4.080285223821471e-05,
"loss": 0.9139,
"step": 2240
},
{
"epoch": 1.3428401663695781,
"grad_norm": 13.190730094909668,
"learning_rate": 4.0670804172718876e-05,
"loss": 1.284,
"step": 2260
},
{
"epoch": 1.3547237076648841,
"grad_norm": 0.3047619163990021,
"learning_rate": 4.053875610722303e-05,
"loss": 0.9321,
"step": 2280
},
{
"epoch": 1.3666072489601901,
"grad_norm": 32.45023727416992,
"learning_rate": 4.040670804172719e-05,
"loss": 1.217,
"step": 2300
},
{
"epoch": 1.3784907902554961,
"grad_norm": 15.72593879699707,
"learning_rate": 4.027465997623135e-05,
"loss": 1.7323,
"step": 2320
},
{
"epoch": 1.3903743315508021,
"grad_norm": 29.2542781829834,
"learning_rate": 4.014261191073551e-05,
"loss": 1.2535,
"step": 2340
},
{
"epoch": 1.4022578728461081,
"grad_norm": 23.876358032226562,
"learning_rate": 4.001056384523967e-05,
"loss": 1.8049,
"step": 2360
},
{
"epoch": 1.4141414141414141,
"grad_norm": 6.0813398361206055,
"learning_rate": 3.987851577974383e-05,
"loss": 1.2617,
"step": 2380
},
{
"epoch": 1.4260249554367201,
"grad_norm": 13.96455192565918,
"learning_rate": 3.974646771424799e-05,
"loss": 1.149,
"step": 2400
},
{
"epoch": 1.4379084967320261,
"grad_norm": 0.3341121971607208,
"learning_rate": 3.9614419648752146e-05,
"loss": 0.8563,
"step": 2420
},
{
"epoch": 1.4497920380273321,
"grad_norm": 9.068788528442383,
"learning_rate": 3.9482371583256304e-05,
"loss": 1.5814,
"step": 2440
},
{
"epoch": 1.4616755793226381,
"grad_norm": 6.83055305480957,
"learning_rate": 3.935032351776047e-05,
"loss": 1.4922,
"step": 2460
},
{
"epoch": 1.4735591206179441,
"grad_norm": 6.623952865600586,
"learning_rate": 3.9218275452264625e-05,
"loss": 1.0381,
"step": 2480
},
{
"epoch": 1.4854426619132501,
"grad_norm": 14.515748977661133,
"learning_rate": 3.908622738676879e-05,
"loss": 1.5335,
"step": 2500
},
{
"epoch": 1.4973262032085561,
"grad_norm": 30.07071876525879,
"learning_rate": 3.8954179321272946e-05,
"loss": 1.4818,
"step": 2520
},
{
"epoch": 1.5092097445038621,
"grad_norm": 18.413236618041992,
"learning_rate": 3.88221312557771e-05,
"loss": 1.2395,
"step": 2540
},
{
"epoch": 1.5210932857991681,
"grad_norm": 15.883363723754883,
"learning_rate": 3.869008319028126e-05,
"loss": 1.2161,
"step": 2560
},
{
"epoch": 1.5329768270944741,
"grad_norm": 21.880586624145508,
"learning_rate": 3.8558035124785424e-05,
"loss": 0.9101,
"step": 2580
},
{
"epoch": 1.5448603683897801,
"grad_norm": 18.103723526000977,
"learning_rate": 3.842598705928958e-05,
"loss": 1.4307,
"step": 2600
},
{
"epoch": 1.5567439096850861,
"grad_norm": 36.77565002441406,
"learning_rate": 3.8293938993793745e-05,
"loss": 1.5081,
"step": 2620
},
{
"epoch": 1.5686274509803921,
"grad_norm": 18.047407150268555,
"learning_rate": 3.81618909282979e-05,
"loss": 1.1816,
"step": 2640
},
{
"epoch": 1.5805109922756981,
"grad_norm": 6.798829555511475,
"learning_rate": 3.802984286280206e-05,
"loss": 1.0369,
"step": 2660
},
{
"epoch": 1.5923945335710041,
"grad_norm": 49.81755447387695,
"learning_rate": 3.7897794797306216e-05,
"loss": 1.3919,
"step": 2680
},
{
"epoch": 1.6042780748663101,
"grad_norm": 7.157380104064941,
"learning_rate": 3.776574673181038e-05,
"loss": 1.1026,
"step": 2700
},
{
"epoch": 1.6161616161616161,
"grad_norm": 10.095293998718262,
"learning_rate": 3.7633698666314544e-05,
"loss": 1.486,
"step": 2720
},
{
"epoch": 1.6280451574569221,
"grad_norm": 11.767765045166016,
"learning_rate": 3.75016506008187e-05,
"loss": 1.5152,
"step": 2740
},
{
"epoch": 1.6399286987522281,
"grad_norm": 7.241148471832275,
"learning_rate": 3.736960253532286e-05,
"loss": 1.4989,
"step": 2760
},
{
"epoch": 1.6518122400475341,
"grad_norm": 10.613069534301758,
"learning_rate": 3.7237554469827016e-05,
"loss": 1.1205,
"step": 2780
},
{
"epoch": 1.6636957813428401,
"grad_norm": 9.965812683105469,
"learning_rate": 3.710550640433118e-05,
"loss": 1.3987,
"step": 2800
},
{
"epoch": 1.6755793226381461,
"grad_norm": 1.9508392810821533,
"learning_rate": 3.697345833883534e-05,
"loss": 0.8366,
"step": 2820
},
{
"epoch": 1.6874628639334521,
"grad_norm": 8.022974967956543,
"learning_rate": 3.68414102733395e-05,
"loss": 1.481,
"step": 2840
},
{
"epoch": 1.6993464052287581,
"grad_norm": 3.9751815795898438,
"learning_rate": 3.670936220784366e-05,
"loss": 1.0167,
"step": 2860
},
{
"epoch": 1.7112299465240641,
"grad_norm": 3.7135047912597656,
"learning_rate": 3.657731414234782e-05,
"loss": 0.9009,
"step": 2880
},
{
"epoch": 1.7231134878193701,
"grad_norm": 4.908830165863037,
"learning_rate": 3.644526607685197e-05,
"loss": 1.833,
"step": 2900
},
{
"epoch": 1.7349970291146761,
"grad_norm": 13.40709400177002,
"learning_rate": 3.6313218011356136e-05,
"loss": 1.6045,
"step": 2920
},
{
"epoch": 1.7468805704099821,
"grad_norm": 15.4568510055542,
"learning_rate": 3.6187772349135085e-05,
"loss": 0.8128,
"step": 2940
},
{
"epoch": 1.7587641117052881,
"grad_norm": 23.41459083557129,
"learning_rate": 3.605572428363925e-05,
"loss": 1.5322,
"step": 2960
},
{
"epoch": 1.7706476530005941,
"grad_norm": 2.598500967025757,
"learning_rate": 3.5923676218143406e-05,
"loss": 1.0382,
"step": 2980
},
{
"epoch": 1.7825311942959001,
"grad_norm": 4.4699788093566895,
"learning_rate": 3.579162815264756e-05,
"loss": 0.8716,
"step": 3000
},
{
"epoch": 1.7944147355912063,
"grad_norm": 12.809410095214844,
"learning_rate": 3.565958008715172e-05,
"loss": 1.1505,
"step": 3020
},
{
"epoch": 1.8062982768865123,
"grad_norm": 11.521143913269043,
"learning_rate": 3.5527532021655884e-05,
"loss": 0.932,
"step": 3040
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.12162753939628601,
"learning_rate": 3.539548395616004e-05,
"loss": 1.2314,
"step": 3060
},
{
"epoch": 1.8300653594771243,
"grad_norm": 33.265018463134766,
"learning_rate": 3.5263435890664205e-05,
"loss": 1.6528,
"step": 3080
},
{
"epoch": 1.8419489007724303,
"grad_norm": 13.268301010131836,
"learning_rate": 3.513138782516836e-05,
"loss": 1.7362,
"step": 3100
},
{
"epoch": 1.8538324420677363,
"grad_norm": 2.0898756980895996,
"learning_rate": 3.4999339759672526e-05,
"loss": 1.0029,
"step": 3120
},
{
"epoch": 1.8657159833630423,
"grad_norm": 23.98683738708496,
"learning_rate": 3.486729169417668e-05,
"loss": 1.2156,
"step": 3140
},
{
"epoch": 1.8775995246583483,
"grad_norm": 3.1218037605285645,
"learning_rate": 3.473524362868084e-05,
"loss": 1.5906,
"step": 3160
},
{
"epoch": 1.8894830659536543,
"grad_norm": 0.22381114959716797,
"learning_rate": 3.4603195563185004e-05,
"loss": 0.8903,
"step": 3180
},
{
"epoch": 1.9013666072489603,
"grad_norm": 0.15086892247200012,
"learning_rate": 3.447114749768916e-05,
"loss": 1.9203,
"step": 3200
},
{
"epoch": 1.9132501485442663,
"grad_norm": 4.569338798522949,
"learning_rate": 3.4339099432193325e-05,
"loss": 1.4692,
"step": 3220
},
{
"epoch": 1.9251336898395723,
"grad_norm": 0.027358679100871086,
"learning_rate": 3.420705136669748e-05,
"loss": 0.8022,
"step": 3240
},
{
"epoch": 1.9370172311348783,
"grad_norm": 4.4244465827941895,
"learning_rate": 3.407500330120164e-05,
"loss": 0.8377,
"step": 3260
},
{
"epoch": 1.9489007724301843,
"grad_norm": 11.952341079711914,
"learning_rate": 3.3942955235705797e-05,
"loss": 1.3864,
"step": 3280
},
{
"epoch": 1.9607843137254903,
"grad_norm": 26.201208114624023,
"learning_rate": 3.381090717020996e-05,
"loss": 1.9779,
"step": 3300
},
{
"epoch": 1.9726678550207963,
"grad_norm": 7.364301681518555,
"learning_rate": 3.367885910471412e-05,
"loss": 1.2488,
"step": 3320
},
{
"epoch": 1.9845513963161023,
"grad_norm": 0.006225454155355692,
"learning_rate": 3.354681103921828e-05,
"loss": 1.7714,
"step": 3340
},
{
"epoch": 1.9964349376114083,
"grad_norm": 8.852691650390625,
"learning_rate": 3.341476297372244e-05,
"loss": 1.2031,
"step": 3360
},
{
"epoch": 2.0,
"eval_accuracy": 0.4781144781144781,
"eval_loss": 2.019242763519287,
"eval_runtime": 95.4083,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 3.113,
"step": 3366
},
{
"epoch": 2.0083184789067143,
"grad_norm": 31.287784576416016,
"learning_rate": 3.3282714908226596e-05,
"loss": 0.8208,
"step": 3380
},
{
"epoch": 2.0202020202020203,
"grad_norm": 2.58017635345459,
"learning_rate": 3.315066684273075e-05,
"loss": 1.1056,
"step": 3400
},
{
"epoch": 2.0320855614973263,
"grad_norm": 0.5721943974494934,
"learning_rate": 3.301861877723492e-05,
"loss": 1.0099,
"step": 3420
},
{
"epoch": 2.0439691027926323,
"grad_norm": 24.783836364746094,
"learning_rate": 3.2886570711739074e-05,
"loss": 1.5317,
"step": 3440
},
{
"epoch": 2.0558526440879383,
"grad_norm": 21.35218620300293,
"learning_rate": 3.275452264624324e-05,
"loss": 1.4263,
"step": 3460
},
{
"epoch": 2.0677361853832443,
"grad_norm": 6.621454238891602,
"learning_rate": 3.2622474580747395e-05,
"loss": 1.7136,
"step": 3480
},
{
"epoch": 2.0796197266785503,
"grad_norm": 0.5743517875671387,
"learning_rate": 3.249042651525155e-05,
"loss": 1.3269,
"step": 3500
},
{
"epoch": 2.0915032679738563,
"grad_norm": 6.627630233764648,
"learning_rate": 3.235837844975571e-05,
"loss": 1.093,
"step": 3520
},
{
"epoch": 2.1033868092691623,
"grad_norm": 8.044147491455078,
"learning_rate": 3.222633038425987e-05,
"loss": 1.6103,
"step": 3540
},
{
"epoch": 2.1152703505644683,
"grad_norm": 5.326707363128662,
"learning_rate": 3.209428231876403e-05,
"loss": 0.785,
"step": 3560
},
{
"epoch": 2.1271538918597743,
"grad_norm": 12.404735565185547,
"learning_rate": 3.1962234253268194e-05,
"loss": 0.6382,
"step": 3580
},
{
"epoch": 2.1390374331550803,
"grad_norm": 20.279443740844727,
"learning_rate": 3.183018618777235e-05,
"loss": 1.6417,
"step": 3600
},
{
"epoch": 2.1509209744503863,
"grad_norm": 5.098419189453125,
"learning_rate": 3.169813812227651e-05,
"loss": 1.1915,
"step": 3620
},
{
"epoch": 2.1628045157456923,
"grad_norm": 14.745153427124023,
"learning_rate": 3.1566090056780666e-05,
"loss": 0.906,
"step": 3640
},
{
"epoch": 2.1746880570409983,
"grad_norm": 13.50672435760498,
"learning_rate": 3.143404199128483e-05,
"loss": 1.318,
"step": 3660
},
{
"epoch": 2.1865715983363043,
"grad_norm": 0.5474358797073364,
"learning_rate": 3.130199392578899e-05,
"loss": 1.3119,
"step": 3680
},
{
"epoch": 2.1984551396316103,
"grad_norm": 4.46449089050293,
"learning_rate": 3.116994586029315e-05,
"loss": 1.263,
"step": 3700
},
{
"epoch": 2.2103386809269163,
"grad_norm": 0.36209437251091003,
"learning_rate": 3.103789779479731e-05,
"loss": 1.4981,
"step": 3720
},
{
"epoch": 2.2222222222222223,
"grad_norm": 28.142601013183594,
"learning_rate": 3.0905849729301465e-05,
"loss": 0.8877,
"step": 3740
},
{
"epoch": 2.2341057635175283,
"grad_norm": 0.3299981355667114,
"learning_rate": 3.077380166380563e-05,
"loss": 1.261,
"step": 3760
},
{
"epoch": 2.2459893048128343,
"grad_norm": 27.140256881713867,
"learning_rate": 3.0641753598309786e-05,
"loss": 1.8361,
"step": 3780
},
{
"epoch": 2.2578728461081403,
"grad_norm": 8.372640609741211,
"learning_rate": 3.0509705532813947e-05,
"loss": 1.3107,
"step": 3800
},
{
"epoch": 2.2697563874034463,
"grad_norm": 0.0998767837882042,
"learning_rate": 3.0377657467318104e-05,
"loss": 1.4373,
"step": 3820
},
{
"epoch": 2.2816399286987523,
"grad_norm": 6.8200154304504395,
"learning_rate": 3.0245609401822268e-05,
"loss": 1.4613,
"step": 3840
},
{
"epoch": 2.2935234699940583,
"grad_norm": 5.426070213317871,
"learning_rate": 3.0113561336326425e-05,
"loss": 1.4091,
"step": 3860
},
{
"epoch": 2.3054070112893643,
"grad_norm": 8.299811363220215,
"learning_rate": 2.9981513270830585e-05,
"loss": 1.3953,
"step": 3880
},
{
"epoch": 2.3172905525846703,
"grad_norm": 32.09596252441406,
"learning_rate": 2.9849465205334742e-05,
"loss": 0.8212,
"step": 3900
},
{
"epoch": 2.3291740938799763,
"grad_norm": 13.691157341003418,
"learning_rate": 2.9717417139838903e-05,
"loss": 1.2198,
"step": 3920
},
{
"epoch": 2.3410576351752823,
"grad_norm": 10.414209365844727,
"learning_rate": 2.958536907434306e-05,
"loss": 1.8699,
"step": 3940
},
{
"epoch": 2.3529411764705883,
"grad_norm": 13.435007095336914,
"learning_rate": 2.9453321008847224e-05,
"loss": 1.176,
"step": 3960
},
{
"epoch": 2.3648247177658943,
"grad_norm": 7.29554557800293,
"learning_rate": 2.932127294335138e-05,
"loss": 1.3829,
"step": 3980
},
{
"epoch": 2.3767082590612003,
"grad_norm": 0.7349147200584412,
"learning_rate": 2.9189224877855542e-05,
"loss": 0.9339,
"step": 4000
},
{
"epoch": 2.3885918003565063,
"grad_norm": 1.7457711696624756,
"learning_rate": 2.90571768123597e-05,
"loss": 1.0813,
"step": 4020
},
{
"epoch": 2.4004753416518123,
"grad_norm": 8.853830337524414,
"learning_rate": 2.892512874686386e-05,
"loss": 1.2585,
"step": 4040
},
{
"epoch": 2.4123588829471183,
"grad_norm": 8.946368217468262,
"learning_rate": 2.8793080681368017e-05,
"loss": 1.063,
"step": 4060
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.7700206637382507,
"learning_rate": 2.866103261587218e-05,
"loss": 0.5709,
"step": 4080
},
{
"epoch": 2.4361259655377303,
"grad_norm": 16.189891815185547,
"learning_rate": 2.8528984550376338e-05,
"loss": 0.898,
"step": 4100
},
{
"epoch": 2.4480095068330363,
"grad_norm": 0.00046739584649913013,
"learning_rate": 2.8396936484880498e-05,
"loss": 1.4132,
"step": 4120
},
{
"epoch": 2.4598930481283423,
"grad_norm": 30.322633743286133,
"learning_rate": 2.8264888419384655e-05,
"loss": 1.3733,
"step": 4140
},
{
"epoch": 2.4717765894236483,
"grad_norm": 2.891150951385498,
"learning_rate": 2.8132840353888816e-05,
"loss": 0.7882,
"step": 4160
},
{
"epoch": 2.4836601307189543,
"grad_norm": 10.033570289611816,
"learning_rate": 2.8000792288392973e-05,
"loss": 1.3073,
"step": 4180
},
{
"epoch": 2.4955436720142603,
"grad_norm": 0.06040190905332565,
"learning_rate": 2.7868744222897137e-05,
"loss": 1.0438,
"step": 4200
},
{
"epoch": 2.5074272133095663,
"grad_norm": 26.081911087036133,
"learning_rate": 2.7736696157401294e-05,
"loss": 1.3487,
"step": 4220
},
{
"epoch": 2.5193107546048723,
"grad_norm": 6.065289497375488,
"learning_rate": 2.7604648091905455e-05,
"loss": 0.8482,
"step": 4240
},
{
"epoch": 2.5311942959001783,
"grad_norm": 20.791973114013672,
"learning_rate": 2.747260002640961e-05,
"loss": 1.1362,
"step": 4260
},
{
"epoch": 2.5430778371954843,
"grad_norm": 0.8581683039665222,
"learning_rate": 2.7340551960913776e-05,
"loss": 1.1743,
"step": 4280
},
{
"epoch": 2.5549613784907903,
"grad_norm": 10.96738338470459,
"learning_rate": 2.7208503895417936e-05,
"loss": 0.6798,
"step": 4300
},
{
"epoch": 2.5668449197860963,
"grad_norm": 7.614635467529297,
"learning_rate": 2.7076455829922093e-05,
"loss": 1.3763,
"step": 4320
},
{
"epoch": 2.5787284610814023,
"grad_norm": 12.606915473937988,
"learning_rate": 2.6944407764426254e-05,
"loss": 0.8797,
"step": 4340
},
{
"epoch": 2.5906120023767083,
"grad_norm": 0.8938280940055847,
"learning_rate": 2.681235969893041e-05,
"loss": 1.3393,
"step": 4360
},
{
"epoch": 2.6024955436720143,
"grad_norm": 32.35802459716797,
"learning_rate": 2.6680311633434575e-05,
"loss": 1.1602,
"step": 4380
},
{
"epoch": 2.6143790849673203,
"grad_norm": 8.067231178283691,
"learning_rate": 2.6548263567938732e-05,
"loss": 1.0959,
"step": 4400
},
{
"epoch": 2.6262626262626263,
"grad_norm": 33.88584899902344,
"learning_rate": 2.6416215502442892e-05,
"loss": 2.0019,
"step": 4420
},
{
"epoch": 2.6381461675579323,
"grad_norm": 16.35814094543457,
"learning_rate": 2.628416743694705e-05,
"loss": 1.257,
"step": 4440
},
{
"epoch": 2.6500297088532383,
"grad_norm": 20.917156219482422,
"learning_rate": 2.615211937145121e-05,
"loss": 1.0194,
"step": 4460
},
{
"epoch": 2.6619132501485443,
"grad_norm": 1.7137622833251953,
"learning_rate": 2.6020071305955367e-05,
"loss": 0.9663,
"step": 4480
},
{
"epoch": 2.6737967914438503,
"grad_norm": 11.321816444396973,
"learning_rate": 2.588802324045953e-05,
"loss": 1.3196,
"step": 4500
},
{
"epoch": 2.6856803327391563,
"grad_norm": 0.3049762547016144,
"learning_rate": 2.575597517496369e-05,
"loss": 1.2364,
"step": 4520
},
{
"epoch": 2.6975638740344623,
"grad_norm": 12.133074760437012,
"learning_rate": 2.562392710946785e-05,
"loss": 0.6172,
"step": 4540
},
{
"epoch": 2.7094474153297683,
"grad_norm": 17.304231643676758,
"learning_rate": 2.5491879043972006e-05,
"loss": 1.2715,
"step": 4560
},
{
"epoch": 2.7213309566250743,
"grad_norm": 23.675540924072266,
"learning_rate": 2.5359830978476167e-05,
"loss": 1.5344,
"step": 4580
},
{
"epoch": 2.7332144979203803,
"grad_norm": 0.7793062329292297,
"learning_rate": 2.5227782912980324e-05,
"loss": 1.201,
"step": 4600
},
{
"epoch": 2.7450980392156863,
"grad_norm": 7.877832412719727,
"learning_rate": 2.5095734847484488e-05,
"loss": 1.1906,
"step": 4620
},
{
"epoch": 2.7569815805109923,
"grad_norm": 16.44761085510254,
"learning_rate": 2.4963686781988645e-05,
"loss": 1.3473,
"step": 4640
},
{
"epoch": 2.7688651218062983,
"grad_norm": 44.280391693115234,
"learning_rate": 2.4831638716492805e-05,
"loss": 1.0871,
"step": 4660
},
{
"epoch": 2.7807486631016043,
"grad_norm": 12.280585289001465,
"learning_rate": 2.4699590650996966e-05,
"loss": 0.9871,
"step": 4680
},
{
"epoch": 2.7926322043969103,
"grad_norm": 20.463937759399414,
"learning_rate": 2.4567542585501123e-05,
"loss": 1.2736,
"step": 4700
},
{
"epoch": 2.8045157456922163,
"grad_norm": 16.443626403808594,
"learning_rate": 2.4435494520005283e-05,
"loss": 1.4343,
"step": 4720
},
{
"epoch": 2.8163992869875223,
"grad_norm": 0.39355403184890747,
"learning_rate": 2.4303446454509444e-05,
"loss": 1.4741,
"step": 4740
},
{
"epoch": 2.8282828282828283,
"grad_norm": 0.24809391796588898,
"learning_rate": 2.41713983890136e-05,
"loss": 1.3588,
"step": 4760
},
{
"epoch": 2.8401663695781343,
"grad_norm": 6.478326797485352,
"learning_rate": 2.403935032351776e-05,
"loss": 1.2004,
"step": 4780
},
{
"epoch": 2.8520499108734403,
"grad_norm": 7.24426794052124,
"learning_rate": 2.3907302258021922e-05,
"loss": 0.9857,
"step": 4800
},
{
"epoch": 2.8639334521687463,
"grad_norm": 50.702274322509766,
"learning_rate": 2.377525419252608e-05,
"loss": 1.3814,
"step": 4820
},
{
"epoch": 2.8758169934640523,
"grad_norm": 29.885473251342773,
"learning_rate": 2.364320612703024e-05,
"loss": 1.4847,
"step": 4840
},
{
"epoch": 2.8877005347593583,
"grad_norm": 8.818001747131348,
"learning_rate": 2.35111580615344e-05,
"loss": 1.3428,
"step": 4860
},
{
"epoch": 2.8995840760546643,
"grad_norm": 20.55769920349121,
"learning_rate": 2.3379109996038558e-05,
"loss": 0.917,
"step": 4880
},
{
"epoch": 2.9114676173499703,
"grad_norm": 47.634132385253906,
"learning_rate": 2.3247061930542718e-05,
"loss": 1.8805,
"step": 4900
},
{
"epoch": 2.9233511586452763,
"grad_norm": 6.875288963317871,
"learning_rate": 2.311501386504688e-05,
"loss": 1.2572,
"step": 4920
},
{
"epoch": 2.9352346999405823,
"grad_norm": 14.847312927246094,
"learning_rate": 2.2982965799551036e-05,
"loss": 1.1894,
"step": 4940
},
{
"epoch": 2.9471182412358883,
"grad_norm": 12.259700775146484,
"learning_rate": 2.2850917734055196e-05,
"loss": 1.336,
"step": 4960
},
{
"epoch": 2.9590017825311943,
"grad_norm": 0.7911491990089417,
"learning_rate": 2.2718869668559357e-05,
"loss": 1.1977,
"step": 4980
},
{
"epoch": 2.9708853238265003,
"grad_norm": 0.1997266709804535,
"learning_rate": 2.2586821603063514e-05,
"loss": 0.8854,
"step": 5000
},
{
"epoch": 2.9827688651218063,
"grad_norm": 20.288911819458008,
"learning_rate": 2.2454773537567674e-05,
"loss": 1.7974,
"step": 5020
},
{
"epoch": 2.9946524064171123,
"grad_norm": 9.013290405273438,
"learning_rate": 2.2322725472071835e-05,
"loss": 1.3532,
"step": 5040
},
{
"epoch": 3.0,
"eval_accuracy": 0.5286195286195287,
"eval_loss": 1.7492114305496216,
"eval_runtime": 96.0692,
"eval_samples_per_second": 3.092,
"eval_steps_per_second": 3.092,
"step": 5049
},
{
"epoch": 3.0065359477124183,
"grad_norm": 0.3140548765659332,
"learning_rate": 2.2190677406575992e-05,
"loss": 1.524,
"step": 5060
},
{
"epoch": 3.0184194890077243,
"grad_norm": 13.2945556640625,
"learning_rate": 2.2058629341080153e-05,
"loss": 1.2853,
"step": 5080
},
{
"epoch": 3.0303030303030303,
"grad_norm": 29.13817024230957,
"learning_rate": 2.1926581275584313e-05,
"loss": 0.8849,
"step": 5100
},
{
"epoch": 3.0421865715983363,
"grad_norm": 2.024958848953247,
"learning_rate": 2.179453321008847e-05,
"loss": 0.812,
"step": 5120
},
{
"epoch": 3.0540701128936423,
"grad_norm": 8.262164115905762,
"learning_rate": 2.1662485144592634e-05,
"loss": 1.0216,
"step": 5140
},
{
"epoch": 3.0659536541889483,
"grad_norm": 7.723167896270752,
"learning_rate": 2.1530437079096795e-05,
"loss": 1.4141,
"step": 5160
},
{
"epoch": 3.0778371954842543,
"grad_norm": 29.036544799804688,
"learning_rate": 2.1398389013600952e-05,
"loss": 1.3481,
"step": 5180
},
{
"epoch": 3.0897207367795603,
"grad_norm": 21.65529441833496,
"learning_rate": 2.1266340948105112e-05,
"loss": 1.4193,
"step": 5200
},
{
"epoch": 3.1016042780748663,
"grad_norm": 4.1284637451171875,
"learning_rate": 2.1134292882609273e-05,
"loss": 1.0835,
"step": 5220
},
{
"epoch": 3.1134878193701723,
"grad_norm": 2.1729178428649902,
"learning_rate": 2.100224481711343e-05,
"loss": 0.9514,
"step": 5240
},
{
"epoch": 3.1253713606654783,
"grad_norm": 27.18805694580078,
"learning_rate": 2.087019675161759e-05,
"loss": 1.1735,
"step": 5260
},
{
"epoch": 3.1372549019607843,
"grad_norm": 28.346616744995117,
"learning_rate": 2.073814868612175e-05,
"loss": 1.1961,
"step": 5280
},
{
"epoch": 3.1491384432560903,
"grad_norm": 0.26050230860710144,
"learning_rate": 2.0606100620625908e-05,
"loss": 1.08,
"step": 5300
},
{
"epoch": 3.1610219845513963,
"grad_norm": 16.993898391723633,
"learning_rate": 2.047405255513007e-05,
"loss": 0.9361,
"step": 5320
},
{
"epoch": 3.1729055258467023,
"grad_norm": 24.867691040039062,
"learning_rate": 2.034200448963423e-05,
"loss": 0.912,
"step": 5340
},
{
"epoch": 3.1847890671420083,
"grad_norm": 21.213882446289062,
"learning_rate": 2.0209956424138386e-05,
"loss": 0.9956,
"step": 5360
},
{
"epoch": 3.1966726084373143,
"grad_norm": 1.9638105630874634,
"learning_rate": 2.0077908358642547e-05,
"loss": 1.1497,
"step": 5380
},
{
"epoch": 3.2085561497326203,
"grad_norm": 38.245845794677734,
"learning_rate": 1.9945860293146708e-05,
"loss": 1.4225,
"step": 5400
},
{
"epoch": 3.2204396910279263,
"grad_norm": 2.6078875064849854,
"learning_rate": 1.9813812227650865e-05,
"loss": 1.0647,
"step": 5420
},
{
"epoch": 3.2323232323232323,
"grad_norm": 10.162765502929688,
"learning_rate": 1.9681764162155025e-05,
"loss": 0.8759,
"step": 5440
},
{
"epoch": 3.2442067736185383,
"grad_norm": 46.642784118652344,
"learning_rate": 1.9549716096659186e-05,
"loss": 0.8714,
"step": 5460
},
{
"epoch": 3.2560903149138443,
"grad_norm": 13.699341773986816,
"learning_rate": 1.9417668031163343e-05,
"loss": 1.3097,
"step": 5480
},
{
"epoch": 3.2679738562091503,
"grad_norm": 0.002345487242564559,
"learning_rate": 1.9285619965667503e-05,
"loss": 1.2136,
"step": 5500
},
{
"epoch": 3.2798573975044563,
"grad_norm": 1.9926633834838867,
"learning_rate": 1.9153571900171664e-05,
"loss": 1.7481,
"step": 5520
},
{
"epoch": 3.2917409387997623,
"grad_norm": 1.4836812019348145,
"learning_rate": 1.902152383467582e-05,
"loss": 1.0077,
"step": 5540
},
{
"epoch": 3.3036244800950683,
"grad_norm": 11.53355884552002,
"learning_rate": 1.888947576917998e-05,
"loss": 1.0962,
"step": 5560
},
{
"epoch": 3.3155080213903743,
"grad_norm": 0.4469129741191864,
"learning_rate": 1.8757427703684142e-05,
"loss": 0.5683,
"step": 5580
},
{
"epoch": 3.3273915626856803,
"grad_norm": 13.35949993133545,
"learning_rate": 1.86253796381883e-05,
"loss": 1.5648,
"step": 5600
},
{
"epoch": 3.3392751039809863,
"grad_norm": 20.45748519897461,
"learning_rate": 1.849333157269246e-05,
"loss": 0.9816,
"step": 5620
},
{
"epoch": 3.3511586452762923,
"grad_norm": 19.171648025512695,
"learning_rate": 1.836128350719662e-05,
"loss": 1.2036,
"step": 5640
},
{
"epoch": 3.3630421865715983,
"grad_norm": 8.639327049255371,
"learning_rate": 1.8229235441700777e-05,
"loss": 1.4011,
"step": 5660
},
{
"epoch": 3.3749257278669043,
"grad_norm": 18.734729766845703,
"learning_rate": 1.8097187376204938e-05,
"loss": 0.9052,
"step": 5680
},
{
"epoch": 3.3868092691622103,
"grad_norm": 8.798933982849121,
"learning_rate": 1.79651393107091e-05,
"loss": 1.065,
"step": 5700
},
{
"epoch": 3.3986928104575163,
"grad_norm": 13.333514213562012,
"learning_rate": 1.783309124521326e-05,
"loss": 1.2257,
"step": 5720
},
{
"epoch": 3.4105763517528223,
"grad_norm": 8.952251434326172,
"learning_rate": 1.770104317971742e-05,
"loss": 1.7173,
"step": 5740
},
{
"epoch": 3.4224598930481283,
"grad_norm": 60.43721389770508,
"learning_rate": 1.756899511422158e-05,
"loss": 1.6395,
"step": 5760
},
{
"epoch": 3.4343434343434343,
"grad_norm": 28.346031188964844,
"learning_rate": 1.7436947048725737e-05,
"loss": 0.7907,
"step": 5780
},
{
"epoch": 3.4462269756387403,
"grad_norm": 35.44182586669922,
"learning_rate": 1.7304898983229898e-05,
"loss": 0.8268,
"step": 5800
},
{
"epoch": 3.4581105169340463,
"grad_norm": 22.25836181640625,
"learning_rate": 1.717285091773406e-05,
"loss": 1.3629,
"step": 5820
},
{
"epoch": 3.4699940582293523,
"grad_norm": 0.6172131299972534,
"learning_rate": 1.7040802852238215e-05,
"loss": 1.3379,
"step": 5840
},
{
"epoch": 3.4818775995246583,
"grad_norm": 19.02311897277832,
"learning_rate": 1.6908754786742376e-05,
"loss": 1.9869,
"step": 5860
},
{
"epoch": 3.4937611408199643,
"grad_norm": 3.069859266281128,
"learning_rate": 1.6776706721246537e-05,
"loss": 1.1256,
"step": 5880
},
{
"epoch": 3.5056446821152702,
"grad_norm": 2.17348051071167,
"learning_rate": 1.6644658655750694e-05,
"loss": 1.6133,
"step": 5900
},
{
"epoch": 3.5175282234105762,
"grad_norm": 0.22041022777557373,
"learning_rate": 1.6512610590254854e-05,
"loss": 1.0189,
"step": 5920
},
{
"epoch": 3.5294117647058822,
"grad_norm": 20.462352752685547,
"learning_rate": 1.6380562524759015e-05,
"loss": 1.5619,
"step": 5940
},
{
"epoch": 3.5412953060011882,
"grad_norm": 0.396953284740448,
"learning_rate": 1.6248514459263172e-05,
"loss": 0.9543,
"step": 5960
},
{
"epoch": 3.5531788472964942,
"grad_norm": 0.23321232199668884,
"learning_rate": 1.6116466393767332e-05,
"loss": 1.8788,
"step": 5980
},
{
"epoch": 3.5650623885918002,
"grad_norm": 3.4248406887054443,
"learning_rate": 1.5984418328271493e-05,
"loss": 1.2511,
"step": 6000
},
{
"epoch": 3.5769459298871062,
"grad_norm": 12.430109977722168,
"learning_rate": 1.585237026277565e-05,
"loss": 1.1817,
"step": 6020
},
{
"epoch": 3.5888294711824122,
"grad_norm": 4.982590675354004,
"learning_rate": 1.572032219727981e-05,
"loss": 0.9199,
"step": 6040
},
{
"epoch": 3.6007130124777182,
"grad_norm": 0.2364758551120758,
"learning_rate": 1.558827413178397e-05,
"loss": 0.9745,
"step": 6060
},
{
"epoch": 3.6125965537730242,
"grad_norm": 7.997034072875977,
"learning_rate": 1.5456226066288128e-05,
"loss": 0.6052,
"step": 6080
},
{
"epoch": 3.6244800950683302,
"grad_norm": 0.0892493948340416,
"learning_rate": 1.532417800079229e-05,
"loss": 1.5061,
"step": 6100
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.2148207426071167,
"learning_rate": 1.5192129935296448e-05,
"loss": 0.89,
"step": 6120
},
{
"epoch": 3.6482471776589422,
"grad_norm": 13.237604141235352,
"learning_rate": 1.5060081869800608e-05,
"loss": 0.7876,
"step": 6140
},
{
"epoch": 3.6601307189542482,
"grad_norm": 8.745626449584961,
"learning_rate": 1.4928033804304767e-05,
"loss": 0.8908,
"step": 6160
},
{
"epoch": 3.6720142602495542,
"grad_norm": 13.550650596618652,
"learning_rate": 1.4795985738808926e-05,
"loss": 1.7108,
"step": 6180
},
{
"epoch": 3.6838978015448602,
"grad_norm": 11.73013973236084,
"learning_rate": 1.4663937673313086e-05,
"loss": 1.2625,
"step": 6200
},
{
"epoch": 3.6957813428401662,
"grad_norm": 0.0058449869975447655,
"learning_rate": 1.4531889607817245e-05,
"loss": 1.1066,
"step": 6220
},
{
"epoch": 3.7076648841354722,
"grad_norm": 0.6316971778869629,
"learning_rate": 1.4399841542321404e-05,
"loss": 0.9687,
"step": 6240
},
{
"epoch": 3.7195484254307782,
"grad_norm": 2.3133366107940674,
"learning_rate": 1.4267793476825565e-05,
"loss": 1.2709,
"step": 6260
},
{
"epoch": 3.7314319667260842,
"grad_norm": 33.18562698364258,
"learning_rate": 1.4135745411329727e-05,
"loss": 1.2372,
"step": 6280
},
{
"epoch": 3.7433155080213902,
"grad_norm": 3.1751153469085693,
"learning_rate": 1.4003697345833886e-05,
"loss": 1.6425,
"step": 6300
},
{
"epoch": 3.7551990493166962,
"grad_norm": 21.756851196289062,
"learning_rate": 1.3871649280338044e-05,
"loss": 0.9176,
"step": 6320
},
{
"epoch": 3.7670825906120022,
"grad_norm": 0.19835074245929718,
"learning_rate": 1.3739601214842205e-05,
"loss": 0.5485,
"step": 6340
},
{
"epoch": 3.7789661319073082,
"grad_norm": 20.786287307739258,
"learning_rate": 1.3607553149346364e-05,
"loss": 1.8427,
"step": 6360
},
{
"epoch": 3.7908496732026142,
"grad_norm": 3.5100600719451904,
"learning_rate": 1.3475505083850523e-05,
"loss": 1.1399,
"step": 6380
},
{
"epoch": 3.8027332144979202,
"grad_norm": 30.671335220336914,
"learning_rate": 1.3343457018354683e-05,
"loss": 0.9599,
"step": 6400
},
{
"epoch": 3.8146167557932262,
"grad_norm": 15.896026611328125,
"learning_rate": 1.3211408952858842e-05,
"loss": 0.6175,
"step": 6420
},
{
"epoch": 3.8265002970885322,
"grad_norm": 44.605567932128906,
"learning_rate": 1.3079360887363e-05,
"loss": 1.1109,
"step": 6440
},
{
"epoch": 3.8383838383838382,
"grad_norm": 3.223285675048828,
"learning_rate": 1.2947312821867161e-05,
"loss": 1.3965,
"step": 6460
},
{
"epoch": 3.8502673796791442,
"grad_norm": 0.000214108542422764,
"learning_rate": 1.281526475637132e-05,
"loss": 1.1792,
"step": 6480
},
{
"epoch": 3.8621509209744502,
"grad_norm": 26.38980484008789,
"learning_rate": 1.2683216690875479e-05,
"loss": 1.3786,
"step": 6500
},
{
"epoch": 3.8740344622697562,
"grad_norm": 17.448007583618164,
"learning_rate": 1.255116862537964e-05,
"loss": 1.0316,
"step": 6520
},
{
"epoch": 3.8859180035650622,
"grad_norm": 26.470539093017578,
"learning_rate": 1.2419120559883798e-05,
"loss": 1.0809,
"step": 6540
},
{
"epoch": 3.8978015448603682,
"grad_norm": 7.585838317871094,
"learning_rate": 1.2287072494387957e-05,
"loss": 0.9324,
"step": 6560
},
{
"epoch": 3.9096850861556742,
"grad_norm": 0.20393586158752441,
"learning_rate": 1.2155024428892118e-05,
"loss": 1.3897,
"step": 6580
},
{
"epoch": 3.9215686274509802,
"grad_norm": 0.0758831799030304,
"learning_rate": 1.2022976363396277e-05,
"loss": 0.9505,
"step": 6600
},
{
"epoch": 3.9334521687462862,
"grad_norm": 9.853219985961914,
"learning_rate": 1.1890928297900435e-05,
"loss": 0.7602,
"step": 6620
},
{
"epoch": 3.9453357100415927,
"grad_norm": 25.84638214111328,
"learning_rate": 1.1758880232404596e-05,
"loss": 0.8738,
"step": 6640
},
{
"epoch": 3.9572192513368982,
"grad_norm": 7.686094760894775,
"learning_rate": 1.1626832166908755e-05,
"loss": 0.7964,
"step": 6660
},
{
"epoch": 3.9691027926322047,
"grad_norm": 22.53441047668457,
"learning_rate": 1.1494784101412914e-05,
"loss": 1.2056,
"step": 6680
},
{
"epoch": 3.9809863339275102,
"grad_norm": 11.927616119384766,
"learning_rate": 1.1362736035917074e-05,
"loss": 1.2257,
"step": 6700
},
{
"epoch": 3.9928698752228167,
"grad_norm": 26.10556983947754,
"learning_rate": 1.1230687970421235e-05,
"loss": 1.4232,
"step": 6720
},
{
"epoch": 4.0,
"eval_accuracy": 0.494949494949495,
"eval_loss": 1.853576421737671,
"eval_runtime": 95.916,
"eval_samples_per_second": 3.096,
"eval_steps_per_second": 3.096,
"step": 6732
},
{
"epoch": 4.004753416518122,
"grad_norm": 12.647480964660645,
"learning_rate": 1.1098639904925393e-05,
"loss": 1.668,
"step": 6740
},
{
"epoch": 4.016636957813429,
"grad_norm": 10.921769142150879,
"learning_rate": 1.0966591839429554e-05,
"loss": 1.0108,
"step": 6760
},
{
"epoch": 4.028520499108734,
"grad_norm": 19.605995178222656,
"learning_rate": 1.0834543773933713e-05,
"loss": 0.8394,
"step": 6780
},
{
"epoch": 4.040404040404041,
"grad_norm": 0.858731746673584,
"learning_rate": 1.0702495708437872e-05,
"loss": 1.0475,
"step": 6800
},
{
"epoch": 4.052287581699346,
"grad_norm": 17.299776077270508,
"learning_rate": 1.0570447642942032e-05,
"loss": 1.3388,
"step": 6820
},
{
"epoch": 4.064171122994653,
"grad_norm": 14.995153427124023,
"learning_rate": 1.0438399577446191e-05,
"loss": 1.6191,
"step": 6840
},
{
"epoch": 4.076054664289958,
"grad_norm": 7.30485200881958,
"learning_rate": 1.030635151195035e-05,
"loss": 0.4908,
"step": 6860
},
{
"epoch": 4.087938205585265,
"grad_norm": 30.56113624572754,
"learning_rate": 1.017430344645451e-05,
"loss": 1.4799,
"step": 6880
},
{
"epoch": 4.09982174688057,
"grad_norm": 0.30605238676071167,
"learning_rate": 1.004225538095867e-05,
"loss": 0.8943,
"step": 6900
},
{
"epoch": 4.111705288175877,
"grad_norm": 0.005065273959189653,
"learning_rate": 9.910207315462828e-06,
"loss": 0.8102,
"step": 6920
},
{
"epoch": 4.123588829471182,
"grad_norm": 15.29279899597168,
"learning_rate": 9.778159249966989e-06,
"loss": 1.6485,
"step": 6940
},
{
"epoch": 4.135472370766489,
"grad_norm": 5.082457065582275,
"learning_rate": 9.646111184471147e-06,
"loss": 1.4701,
"step": 6960
},
{
"epoch": 4.147355912061794,
"grad_norm": 24.667327880859375,
"learning_rate": 9.514063118975306e-06,
"loss": 0.9979,
"step": 6980
},
{
"epoch": 4.159239453357101,
"grad_norm": 3.714982032775879,
"learning_rate": 9.382015053479467e-06,
"loss": 1.7571,
"step": 7000
},
{
"epoch": 4.171122994652406,
"grad_norm": 26.196866989135742,
"learning_rate": 9.249966987983627e-06,
"loss": 1.0751,
"step": 7020
},
{
"epoch": 4.183006535947713,
"grad_norm": 3.4196934700012207,
"learning_rate": 9.117918922487786e-06,
"loss": 1.7258,
"step": 7040
},
{
"epoch": 4.194890077243018,
"grad_norm": 0.00030567339854314923,
"learning_rate": 8.985870856991947e-06,
"loss": 1.2107,
"step": 7060
},
{
"epoch": 4.206773618538325,
"grad_norm": 32.88508224487305,
"learning_rate": 8.853822791496106e-06,
"loss": 1.189,
"step": 7080
},
{
"epoch": 4.21865715983363,
"grad_norm": 2.1774134635925293,
"learning_rate": 8.721774726000264e-06,
"loss": 0.788,
"step": 7100
},
{
"epoch": 4.230540701128937,
"grad_norm": 4.273499011993408,
"learning_rate": 8.589726660504425e-06,
"loss": 1.039,
"step": 7120
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.47934776544570923,
"learning_rate": 8.457678595008584e-06,
"loss": 0.9438,
"step": 7140
},
{
"epoch": 4.254307783719549,
"grad_norm": 0.0029345829971134663,
"learning_rate": 8.325630529512743e-06,
"loss": 0.8263,
"step": 7160
},
{
"epoch": 4.266191325014854,
"grad_norm": 28.229270935058594,
"learning_rate": 8.193582464016903e-06,
"loss": 1.4488,
"step": 7180
},
{
"epoch": 4.278074866310161,
"grad_norm": 65.6903076171875,
"learning_rate": 8.061534398521062e-06,
"loss": 1.137,
"step": 7200
},
{
"epoch": 4.289958407605466,
"grad_norm": 42.370643615722656,
"learning_rate": 7.92948633302522e-06,
"loss": 1.3585,
"step": 7220
},
{
"epoch": 4.301841948900773,
"grad_norm": 3.4301371574401855,
"learning_rate": 7.797438267529381e-06,
"loss": 0.7765,
"step": 7240
},
{
"epoch": 4.313725490196078,
"grad_norm": 27.960153579711914,
"learning_rate": 7.66539020203354e-06,
"loss": 1.119,
"step": 7260
},
{
"epoch": 4.325609031491385,
"grad_norm": 26.195892333984375,
"learning_rate": 7.533342136537701e-06,
"loss": 1.0187,
"step": 7280
},
{
"epoch": 4.33749257278669,
"grad_norm": 0.002473491011187434,
"learning_rate": 7.40129407104186e-06,
"loss": 1.0475,
"step": 7300
},
{
"epoch": 4.349376114081997,
"grad_norm": 9.781648635864258,
"learning_rate": 7.26924600554602e-06,
"loss": 1.2822,
"step": 7320
},
{
"epoch": 4.361259655377302,
"grad_norm": 36.30683135986328,
"learning_rate": 7.137197940050179e-06,
"loss": 1.569,
"step": 7340
},
{
"epoch": 4.373143196672609,
"grad_norm": 56.16667175292969,
"learning_rate": 7.0051498745543385e-06,
"loss": 0.7711,
"step": 7360
},
{
"epoch": 4.385026737967914,
"grad_norm": 29.77967071533203,
"learning_rate": 6.873101809058498e-06,
"loss": 0.9722,
"step": 7380
},
{
"epoch": 4.396910279263221,
"grad_norm": 0.004187744576483965,
"learning_rate": 6.741053743562657e-06,
"loss": 1.1351,
"step": 7400
},
{
"epoch": 4.408793820558526,
"grad_norm": 37.170188903808594,
"learning_rate": 6.609005678066817e-06,
"loss": 0.7695,
"step": 7420
},
{
"epoch": 4.420677361853833,
"grad_norm": 0.7948021292686462,
"learning_rate": 6.476957612570976e-06,
"loss": 1.4337,
"step": 7440
},
{
"epoch": 4.432560903149138,
"grad_norm": 7.071142673492432,
"learning_rate": 6.344909547075135e-06,
"loss": 1.1206,
"step": 7460
},
{
"epoch": 4.444444444444445,
"grad_norm": 7.999136447906494,
"learning_rate": 6.212861481579295e-06,
"loss": 1.0916,
"step": 7480
},
{
"epoch": 4.45632798573975,
"grad_norm": 0.48218631744384766,
"learning_rate": 6.080813416083455e-06,
"loss": 1.2054,
"step": 7500
},
{
"epoch": 4.468211527035057,
"grad_norm": 0.760070264339447,
"learning_rate": 5.948765350587614e-06,
"loss": 0.5043,
"step": 7520
},
{
"epoch": 4.480095068330362,
"grad_norm": 24.503273010253906,
"learning_rate": 5.816717285091774e-06,
"loss": 1.3106,
"step": 7540
},
{
"epoch": 4.491978609625669,
"grad_norm": 1.2818807363510132,
"learning_rate": 5.684669219595933e-06,
"loss": 1.1969,
"step": 7560
},
{
"epoch": 4.503862150920974,
"grad_norm": 41.00161361694336,
"learning_rate": 5.5526211541000925e-06,
"loss": 1.0278,
"step": 7580
},
{
"epoch": 4.515745692216281,
"grad_norm": 16.333484649658203,
"learning_rate": 5.420573088604252e-06,
"loss": 0.9644,
"step": 7600
},
{
"epoch": 4.527629233511586,
"grad_norm": 1.6461055278778076,
"learning_rate": 5.288525023108412e-06,
"loss": 1.2938,
"step": 7620
},
{
"epoch": 4.539512774806893,
"grad_norm": 0.09808467328548431,
"learning_rate": 5.1564769576125715e-06,
"loss": 0.9411,
"step": 7640
},
{
"epoch": 4.551396316102198,
"grad_norm": 7.122012138366699,
"learning_rate": 5.024428892116731e-06,
"loss": 1.114,
"step": 7660
},
{
"epoch": 4.563279857397505,
"grad_norm": 15.417506217956543,
"learning_rate": 4.89238082662089e-06,
"loss": 1.4353,
"step": 7680
},
{
"epoch": 4.57516339869281,
"grad_norm": 22.031047821044922,
"learning_rate": 4.76033276112505e-06,
"loss": 1.5221,
"step": 7700
},
{
"epoch": 4.587046939988117,
"grad_norm": 6.318397521972656,
"learning_rate": 4.628284695629209e-06,
"loss": 0.8358,
"step": 7720
},
{
"epoch": 4.598930481283422,
"grad_norm": 23.213071823120117,
"learning_rate": 4.496236630133368e-06,
"loss": 0.9493,
"step": 7740
},
{
"epoch": 4.610814022578729,
"grad_norm": 36.51852035522461,
"learning_rate": 4.364188564637528e-06,
"loss": 2.1477,
"step": 7760
},
{
"epoch": 4.622697563874034,
"grad_norm": 12.436498641967773,
"learning_rate": 4.2321404991416884e-06,
"loss": 0.9672,
"step": 7780
},
{
"epoch": 4.634581105169341,
"grad_norm": 22.718385696411133,
"learning_rate": 4.100092433645847e-06,
"loss": 1.3351,
"step": 7800
},
{
"epoch": 4.646464646464646,
"grad_norm": 10.148547172546387,
"learning_rate": 3.968044368150007e-06,
"loss": 1.2337,
"step": 7820
},
{
"epoch": 4.658348187759953,
"grad_norm": 15.9627685546875,
"learning_rate": 3.835996302654167e-06,
"loss": 0.6878,
"step": 7840
},
{
"epoch": 4.670231729055258,
"grad_norm": 13.22270393371582,
"learning_rate": 3.703948237158326e-06,
"loss": 1.1205,
"step": 7860
},
{
"epoch": 4.682115270350565,
"grad_norm": 17.37915802001953,
"learning_rate": 3.571900171662485e-06,
"loss": 0.803,
"step": 7880
},
{
"epoch": 4.69399881164587,
"grad_norm": 5.952159404754639,
"learning_rate": 3.4398521061666444e-06,
"loss": 0.6111,
"step": 7900
},
{
"epoch": 4.705882352941177,
"grad_norm": 7.648906707763672,
"learning_rate": 3.307804040670804e-06,
"loss": 0.7931,
"step": 7920
},
{
"epoch": 4.717765894236482,
"grad_norm": 5.780369281768799,
"learning_rate": 3.175755975174964e-06,
"loss": 0.7795,
"step": 7940
},
{
"epoch": 4.729649435531789,
"grad_norm": 0.2420853227376938,
"learning_rate": 3.0437079096791235e-06,
"loss": 0.8364,
"step": 7960
},
{
"epoch": 4.741532976827094,
"grad_norm": 0.007712378166615963,
"learning_rate": 2.9116598441832827e-06,
"loss": 0.7652,
"step": 7980
},
{
"epoch": 4.753416518122401,
"grad_norm": 1.513141393661499,
"learning_rate": 2.7796117786874424e-06,
"loss": 0.6637,
"step": 8000
},
{
"epoch": 4.765300059417706,
"grad_norm": 8.523009300231934,
"learning_rate": 2.6475637131916017e-06,
"loss": 0.7923,
"step": 8020
},
{
"epoch": 4.777183600713013,
"grad_norm": 8.576272010803223,
"learning_rate": 2.5155156476957613e-06,
"loss": 0.8144,
"step": 8040
},
{
"epoch": 4.789067142008318,
"grad_norm": 4.978365421295166,
"learning_rate": 2.383467582199921e-06,
"loss": 1.3135,
"step": 8060
},
{
"epoch": 4.800950683303625,
"grad_norm": 22.845678329467773,
"learning_rate": 2.2514195167040803e-06,
"loss": 1.5446,
"step": 8080
},
{
"epoch": 4.81283422459893,
"grad_norm": 4.380787372589111,
"learning_rate": 2.11937145120824e-06,
"loss": 0.6026,
"step": 8100
},
{
"epoch": 4.824717765894237,
"grad_norm": 0.1694694310426712,
"learning_rate": 1.9873233857123996e-06,
"loss": 1.2781,
"step": 8120
},
{
"epoch": 4.836601307189542,
"grad_norm": 105.8367691040039,
"learning_rate": 1.8552753202165589e-06,
"loss": 1.6788,
"step": 8140
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.12047601491212845,
"learning_rate": 1.7232272547207184e-06,
"loss": 1.0554,
"step": 8160
},
{
"epoch": 4.860368389780154,
"grad_norm": 21.857023239135742,
"learning_rate": 1.591179189224878e-06,
"loss": 1.0381,
"step": 8180
},
{
"epoch": 4.872251931075461,
"grad_norm": 64.79426574707031,
"learning_rate": 1.4591311237290375e-06,
"loss": 1.9111,
"step": 8200
},
{
"epoch": 4.884135472370766,
"grad_norm": 11.07774829864502,
"learning_rate": 1.327083058233197e-06,
"loss": 1.0473,
"step": 8220
},
{
"epoch": 4.896019013666073,
"grad_norm": 25.981216430664062,
"learning_rate": 1.1950349927373565e-06,
"loss": 1.2634,
"step": 8240
},
{
"epoch": 4.907902554961378,
"grad_norm": 16.583120346069336,
"learning_rate": 1.062986927241516e-06,
"loss": 1.1052,
"step": 8260
},
{
"epoch": 4.919786096256685,
"grad_norm": 48.31085968017578,
"learning_rate": 9.309388617456755e-07,
"loss": 1.2838,
"step": 8280
},
{
"epoch": 4.93166963755199,
"grad_norm": 15.064263343811035,
"learning_rate": 7.988907962498351e-07,
"loss": 0.8866,
"step": 8300
},
{
"epoch": 4.943553178847297,
"grad_norm": 4.631924629211426,
"learning_rate": 6.668427307539944e-07,
"loss": 0.9971,
"step": 8320
},
{
"epoch": 4.955436720142602,
"grad_norm": 3.649411678314209,
"learning_rate": 5.347946652581539e-07,
"loss": 0.7016,
"step": 8340
},
{
"epoch": 4.967320261437909,
"grad_norm": 49.374534606933594,
"learning_rate": 4.0274659976231353e-07,
"loss": 1.1992,
"step": 8360
},
{
"epoch": 4.979203802733214,
"grad_norm": 8.74241828918457,
"learning_rate": 2.70698534266473e-07,
"loss": 0.5942,
"step": 8380
},
{
"epoch": 4.991087344028521,
"grad_norm": 20.124603271484375,
"learning_rate": 1.386504687706325e-07,
"loss": 0.9934,
"step": 8400
},
{
"epoch": 5.0,
"eval_accuracy": 0.4983164983164983,
"eval_loss": 1.831876277923584,
"eval_runtime": 97.0764,
"eval_samples_per_second": 3.059,
"eval_steps_per_second": 3.059,
"step": 8415
}
],
"logging_steps": 20,
"max_steps": 8415,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1214329969311736e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}