cccxi / trainer_state.json
AaronWu901225's picture
Upload LoRA adapter folder
79223c7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3651,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008221993833504625,
"grad_norm": 0.9403873682022095,
"learning_rate": 0.0,
"loss": 0.8484,
"step": 1
},
{
"epoch": 0.0041109969167523125,
"grad_norm": 0.8725608587265015,
"learning_rate": 4.3715846994535524e-07,
"loss": 0.9215,
"step": 5
},
{
"epoch": 0.008221993833504625,
"grad_norm": 0.8453531861305237,
"learning_rate": 9.836065573770493e-07,
"loss": 0.9604,
"step": 10
},
{
"epoch": 0.012332990750256937,
"grad_norm": 0.7130156755447388,
"learning_rate": 1.5300546448087432e-06,
"loss": 0.8716,
"step": 15
},
{
"epoch": 0.01644398766700925,
"grad_norm": 1.148436427116394,
"learning_rate": 2.0765027322404376e-06,
"loss": 0.9847,
"step": 20
},
{
"epoch": 0.020554984583761562,
"grad_norm": 1.012150764465332,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.8876,
"step": 25
},
{
"epoch": 0.024665981500513873,
"grad_norm": 0.9746986627578735,
"learning_rate": 3.1693989071038255e-06,
"loss": 0.9536,
"step": 30
},
{
"epoch": 0.02877697841726619,
"grad_norm": 1.1652287244796753,
"learning_rate": 3.7158469945355197e-06,
"loss": 0.9013,
"step": 35
},
{
"epoch": 0.0328879753340185,
"grad_norm": 0.9780025482177734,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.8908,
"step": 40
},
{
"epoch": 0.03699897225077081,
"grad_norm": 0.9223105907440186,
"learning_rate": 4.808743169398907e-06,
"loss": 0.821,
"step": 45
},
{
"epoch": 0.041109969167523124,
"grad_norm": 0.7905207276344299,
"learning_rate": 5.355191256830602e-06,
"loss": 0.8651,
"step": 50
},
{
"epoch": 0.045220966084275435,
"grad_norm": 0.7562019228935242,
"learning_rate": 5.9016393442622956e-06,
"loss": 0.7358,
"step": 55
},
{
"epoch": 0.04933196300102775,
"grad_norm": 0.38864654302597046,
"learning_rate": 6.44808743169399e-06,
"loss": 0.764,
"step": 60
},
{
"epoch": 0.05344295991778006,
"grad_norm": 0.6753324866294861,
"learning_rate": 6.994535519125684e-06,
"loss": 0.8409,
"step": 65
},
{
"epoch": 0.05755395683453238,
"grad_norm": 0.4633980691432953,
"learning_rate": 7.540983606557377e-06,
"loss": 0.6562,
"step": 70
},
{
"epoch": 0.06166495375128469,
"grad_norm": 0.41575130820274353,
"learning_rate": 8.087431693989072e-06,
"loss": 0.7852,
"step": 75
},
{
"epoch": 0.065775950668037,
"grad_norm": 0.6721888184547424,
"learning_rate": 8.633879781420765e-06,
"loss": 0.7728,
"step": 80
},
{
"epoch": 0.0698869475847893,
"grad_norm": 0.7128573060035706,
"learning_rate": 9.18032786885246e-06,
"loss": 0.6927,
"step": 85
},
{
"epoch": 0.07399794450154162,
"grad_norm": 0.4412561058998108,
"learning_rate": 9.726775956284153e-06,
"loss": 0.6382,
"step": 90
},
{
"epoch": 0.07810894141829394,
"grad_norm": 0.501618504524231,
"learning_rate": 1.0273224043715849e-05,
"loss": 0.685,
"step": 95
},
{
"epoch": 0.08221993833504625,
"grad_norm": 0.3464237153530121,
"learning_rate": 1.0819672131147544e-05,
"loss": 0.6624,
"step": 100
},
{
"epoch": 0.08633093525179857,
"grad_norm": 0.5196655988693237,
"learning_rate": 1.1366120218579235e-05,
"loss": 0.6127,
"step": 105
},
{
"epoch": 0.09044193216855087,
"grad_norm": 0.5325089693069458,
"learning_rate": 1.191256830601093e-05,
"loss": 0.5875,
"step": 110
},
{
"epoch": 0.09455292908530319,
"grad_norm": 0.30435436964035034,
"learning_rate": 1.2459016393442624e-05,
"loss": 0.5136,
"step": 115
},
{
"epoch": 0.0986639260020555,
"grad_norm": 0.7467198371887207,
"learning_rate": 1.3005464480874317e-05,
"loss": 0.5921,
"step": 120
},
{
"epoch": 0.10277492291880781,
"grad_norm": 0.7135942578315735,
"learning_rate": 1.3551912568306011e-05,
"loss": 0.5228,
"step": 125
},
{
"epoch": 0.10688591983556012,
"grad_norm": 0.663159966468811,
"learning_rate": 1.4098360655737706e-05,
"loss": 0.5076,
"step": 130
},
{
"epoch": 0.11099691675231244,
"grad_norm": 0.5868617296218872,
"learning_rate": 1.46448087431694e-05,
"loss": 0.4551,
"step": 135
},
{
"epoch": 0.11510791366906475,
"grad_norm": 0.6190376877784729,
"learning_rate": 1.5191256830601094e-05,
"loss": 0.5692,
"step": 140
},
{
"epoch": 0.11921891058581706,
"grad_norm": 0.8358873724937439,
"learning_rate": 1.5737704918032788e-05,
"loss": 0.5348,
"step": 145
},
{
"epoch": 0.12332990750256938,
"grad_norm": 0.6471666693687439,
"learning_rate": 1.628415300546448e-05,
"loss": 0.4363,
"step": 150
},
{
"epoch": 0.12744090441932168,
"grad_norm": 0.5585145354270935,
"learning_rate": 1.6830601092896177e-05,
"loss": 0.4181,
"step": 155
},
{
"epoch": 0.131551901336074,
"grad_norm": 0.5833807587623596,
"learning_rate": 1.737704918032787e-05,
"loss": 0.4133,
"step": 160
},
{
"epoch": 0.13566289825282632,
"grad_norm": 0.7664275169372559,
"learning_rate": 1.7923497267759563e-05,
"loss": 0.5059,
"step": 165
},
{
"epoch": 0.1397738951695786,
"grad_norm": 0.7638784050941467,
"learning_rate": 1.846994535519126e-05,
"loss": 0.3536,
"step": 170
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.5982478857040405,
"learning_rate": 1.9016393442622952e-05,
"loss": 0.4113,
"step": 175
},
{
"epoch": 0.14799588900308325,
"grad_norm": 0.7887367606163025,
"learning_rate": 1.9562841530054645e-05,
"loss": 0.3836,
"step": 180
},
{
"epoch": 0.15210688591983557,
"grad_norm": 0.6601587533950806,
"learning_rate": 1.9999995896905283e-05,
"loss": 0.4506,
"step": 185
},
{
"epoch": 0.15621788283658788,
"grad_norm": 0.7067707180976868,
"learning_rate": 1.9999852288943748e-05,
"loss": 0.4673,
"step": 190
},
{
"epoch": 0.16032887975334018,
"grad_norm": 0.6842494606971741,
"learning_rate": 1.9999503529613444e-05,
"loss": 0.425,
"step": 195
},
{
"epoch": 0.1644398766700925,
"grad_norm": 0.7092298865318298,
"learning_rate": 1.999894962606933e-05,
"loss": 0.3451,
"step": 200
},
{
"epoch": 0.1685508735868448,
"grad_norm": 0.9632850289344788,
"learning_rate": 1.999819058967497e-05,
"loss": 0.3734,
"step": 205
},
{
"epoch": 0.17266187050359713,
"grad_norm": 0.8995895385742188,
"learning_rate": 1.999722643600234e-05,
"loss": 0.4026,
"step": 210
},
{
"epoch": 0.17677286742034942,
"grad_norm": 1.071627140045166,
"learning_rate": 1.9996057184831475e-05,
"loss": 0.3483,
"step": 215
},
{
"epoch": 0.18088386433710174,
"grad_norm": 0.7492429614067078,
"learning_rate": 1.9994682860150073e-05,
"loss": 0.3331,
"step": 220
},
{
"epoch": 0.18499486125385406,
"grad_norm": 0.8149610161781311,
"learning_rate": 1.999310349015301e-05,
"loss": 0.3281,
"step": 225
},
{
"epoch": 0.18910585817060638,
"grad_norm": 0.8723487257957458,
"learning_rate": 1.9991319107241766e-05,
"loss": 0.4638,
"step": 230
},
{
"epoch": 0.1932168550873587,
"grad_norm": 0.8918459415435791,
"learning_rate": 1.9989329748023728e-05,
"loss": 0.3805,
"step": 235
},
{
"epoch": 0.197327852004111,
"grad_norm": 0.7838431596755981,
"learning_rate": 1.998713545331148e-05,
"loss": 0.3863,
"step": 240
},
{
"epoch": 0.2014388489208633,
"grad_norm": 0.8793269991874695,
"learning_rate": 1.9984736268121944e-05,
"loss": 0.3898,
"step": 245
},
{
"epoch": 0.20554984583761562,
"grad_norm": 0.9354943633079529,
"learning_rate": 1.998213224167546e-05,
"loss": 0.371,
"step": 250
},
{
"epoch": 0.20966084275436794,
"grad_norm": 0.8775899410247803,
"learning_rate": 1.997932342739478e-05,
"loss": 0.3388,
"step": 255
},
{
"epoch": 0.21377183967112023,
"grad_norm": 1.0165460109710693,
"learning_rate": 1.9976309882903957e-05,
"loss": 0.3304,
"step": 260
},
{
"epoch": 0.21788283658787255,
"grad_norm": 0.9298381805419922,
"learning_rate": 1.9973091670027184e-05,
"loss": 0.3581,
"step": 265
},
{
"epoch": 0.22199383350462487,
"grad_norm": 0.853493332862854,
"learning_rate": 1.996966885478752e-05,
"loss": 0.3808,
"step": 270
},
{
"epoch": 0.2261048304213772,
"grad_norm": 1.078824520111084,
"learning_rate": 1.996604150740552e-05,
"loss": 0.3559,
"step": 275
},
{
"epoch": 0.2302158273381295,
"grad_norm": 0.6552051305770874,
"learning_rate": 1.9962209702297807e-05,
"loss": 0.3194,
"step": 280
},
{
"epoch": 0.2343268242548818,
"grad_norm": 1.0376273393630981,
"learning_rate": 1.995817351807556e-05,
"loss": 0.4223,
"step": 285
},
{
"epoch": 0.23843782117163412,
"grad_norm": 1.0463075637817383,
"learning_rate": 1.9953933037542864e-05,
"loss": 0.3439,
"step": 290
},
{
"epoch": 0.24254881808838644,
"grad_norm": 0.997489869594574,
"learning_rate": 1.9949488347695044e-05,
"loss": 0.3264,
"step": 295
},
{
"epoch": 0.24665981500513876,
"grad_norm": 1.2122714519500732,
"learning_rate": 1.994483953971687e-05,
"loss": 0.2846,
"step": 300
},
{
"epoch": 0.25077081192189105,
"grad_norm": 1.008628010749817,
"learning_rate": 1.9939986708980686e-05,
"loss": 0.2891,
"step": 305
},
{
"epoch": 0.25488180883864336,
"grad_norm": 0.8791748285293579,
"learning_rate": 1.993492995504444e-05,
"loss": 0.3377,
"step": 310
},
{
"epoch": 0.2589928057553957,
"grad_norm": 0.9464241862297058,
"learning_rate": 1.9929669381649673e-05,
"loss": 0.3894,
"step": 315
},
{
"epoch": 0.263103802672148,
"grad_norm": 1.1983399391174316,
"learning_rate": 1.992420509671936e-05,
"loss": 0.3583,
"step": 320
},
{
"epoch": 0.2672147995889003,
"grad_norm": 1.1101964712142944,
"learning_rate": 1.9918537212355704e-05,
"loss": 0.2783,
"step": 325
},
{
"epoch": 0.27132579650565264,
"grad_norm": 1.0709152221679688,
"learning_rate": 1.9912665844837855e-05,
"loss": 0.349,
"step": 330
},
{
"epoch": 0.27543679342240496,
"grad_norm": 1.0815412998199463,
"learning_rate": 1.9906591114619494e-05,
"loss": 0.3074,
"step": 335
},
{
"epoch": 0.2795477903391572,
"grad_norm": 1.0269906520843506,
"learning_rate": 1.9900313146326384e-05,
"loss": 0.2844,
"step": 340
},
{
"epoch": 0.28365878725590954,
"grad_norm": 1.0790280103683472,
"learning_rate": 1.989383206875381e-05,
"loss": 0.2912,
"step": 345
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.8625520467758179,
"learning_rate": 1.988714801486393e-05,
"loss": 0.2805,
"step": 350
},
{
"epoch": 0.2918807810894142,
"grad_norm": 0.9437581896781921,
"learning_rate": 1.9880261121783046e-05,
"loss": 0.3312,
"step": 355
},
{
"epoch": 0.2959917780061665,
"grad_norm": 0.8448299765586853,
"learning_rate": 1.98731715307988e-05,
"loss": 0.252,
"step": 360
},
{
"epoch": 0.3001027749229188,
"grad_norm": 0.7578240633010864,
"learning_rate": 1.9865879387357272e-05,
"loss": 0.3417,
"step": 365
},
{
"epoch": 0.30421377183967113,
"grad_norm": 1.0512038469314575,
"learning_rate": 1.985838484105999e-05,
"loss": 0.3185,
"step": 370
},
{
"epoch": 0.30832476875642345,
"grad_norm": 1.0025371313095093,
"learning_rate": 1.985068804566087e-05,
"loss": 0.3161,
"step": 375
},
{
"epoch": 0.31243576567317577,
"grad_norm": 0.8572078347206116,
"learning_rate": 1.9842789159063056e-05,
"loss": 0.328,
"step": 380
},
{
"epoch": 0.31654676258992803,
"grad_norm": 1.0248758792877197,
"learning_rate": 1.983468834331568e-05,
"loss": 0.3121,
"step": 385
},
{
"epoch": 0.32065775950668035,
"grad_norm": 0.899937629699707,
"learning_rate": 1.9826385764610542e-05,
"loss": 0.3092,
"step": 390
},
{
"epoch": 0.32476875642343267,
"grad_norm": 0.9734247922897339,
"learning_rate": 1.9817881593278695e-05,
"loss": 0.2803,
"step": 395
},
{
"epoch": 0.328879753340185,
"grad_norm": 0.99278324842453,
"learning_rate": 1.9809176003786953e-05,
"loss": 0.3602,
"step": 400
},
{
"epoch": 0.3329907502569373,
"grad_norm": 0.9903774857521057,
"learning_rate": 1.980026917473432e-05,
"loss": 0.2851,
"step": 405
},
{
"epoch": 0.3371017471736896,
"grad_norm": 0.9723519682884216,
"learning_rate": 1.979116128884831e-05,
"loss": 0.3119,
"step": 410
},
{
"epoch": 0.34121274409044194,
"grad_norm": 1.1728214025497437,
"learning_rate": 1.9781852532981204e-05,
"loss": 0.2933,
"step": 415
},
{
"epoch": 0.34532374100719426,
"grad_norm": 1.1374309062957764,
"learning_rate": 1.9772343098106227e-05,
"loss": 0.2955,
"step": 420
},
{
"epoch": 0.3494347379239466,
"grad_norm": 1.145560622215271,
"learning_rate": 1.9762633179313625e-05,
"loss": 0.3452,
"step": 425
},
{
"epoch": 0.35354573484069884,
"grad_norm": 1.1968661546707153,
"learning_rate": 1.9752722975806643e-05,
"loss": 0.3625,
"step": 430
},
{
"epoch": 0.35765673175745116,
"grad_norm": 1.1781413555145264,
"learning_rate": 1.9742612690897473e-05,
"loss": 0.2798,
"step": 435
},
{
"epoch": 0.3617677286742035,
"grad_norm": 1.2856999635696411,
"learning_rate": 1.973230253200305e-05,
"loss": 0.265,
"step": 440
},
{
"epoch": 0.3658787255909558,
"grad_norm": 1.1101672649383545,
"learning_rate": 1.972179271064083e-05,
"loss": 0.2528,
"step": 445
},
{
"epoch": 0.3699897225077081,
"grad_norm": 1.117969274520874,
"learning_rate": 1.971108344242441e-05,
"loss": 0.3159,
"step": 450
},
{
"epoch": 0.37410071942446044,
"grad_norm": 1.3405277729034424,
"learning_rate": 1.9700174947059146e-05,
"loss": 0.2576,
"step": 455
},
{
"epoch": 0.37821171634121276,
"grad_norm": 1.0911766290664673,
"learning_rate": 1.968906744833762e-05,
"loss": 0.2896,
"step": 460
},
{
"epoch": 0.3823227132579651,
"grad_norm": 0.8619614243507385,
"learning_rate": 1.967776117413505e-05,
"loss": 0.2734,
"step": 465
},
{
"epoch": 0.3864337101747174,
"grad_norm": 0.9228631258010864,
"learning_rate": 1.9666256356404628e-05,
"loss": 0.295,
"step": 470
},
{
"epoch": 0.39054470709146966,
"grad_norm": 1.2374449968338013,
"learning_rate": 1.9654553231172748e-05,
"loss": 0.2974,
"step": 475
},
{
"epoch": 0.394655704008222,
"grad_norm": 1.177212119102478,
"learning_rate": 1.9642652038534174e-05,
"loss": 0.2841,
"step": 480
},
{
"epoch": 0.3987667009249743,
"grad_norm": 0.7735481262207031,
"learning_rate": 1.9630553022647113e-05,
"loss": 0.226,
"step": 485
},
{
"epoch": 0.4028776978417266,
"grad_norm": 1.1202296018600464,
"learning_rate": 1.961825643172819e-05,
"loss": 0.3067,
"step": 490
},
{
"epoch": 0.40698869475847893,
"grad_norm": 1.363702654838562,
"learning_rate": 1.9605762518047387e-05,
"loss": 0.3293,
"step": 495
},
{
"epoch": 0.41109969167523125,
"grad_norm": 0.9886282682418823,
"learning_rate": 1.959307153792283e-05,
"loss": 0.2591,
"step": 500
},
{
"epoch": 0.41521068859198357,
"grad_norm": 0.8387702703475952,
"learning_rate": 1.9580183751715563e-05,
"loss": 0.2815,
"step": 505
},
{
"epoch": 0.4193216855087359,
"grad_norm": 1.0631368160247803,
"learning_rate": 1.956709942382419e-05,
"loss": 0.2227,
"step": 510
},
{
"epoch": 0.4234326824254882,
"grad_norm": 0.6852805614471436,
"learning_rate": 1.955381882267945e-05,
"loss": 0.2567,
"step": 515
},
{
"epoch": 0.42754367934224047,
"grad_norm": 1.046186089515686,
"learning_rate": 1.9540342220738726e-05,
"loss": 0.2553,
"step": 520
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.8813861608505249,
"learning_rate": 1.952666989448043e-05,
"loss": 0.311,
"step": 525
},
{
"epoch": 0.4357656731757451,
"grad_norm": 1.0005453824996948,
"learning_rate": 1.9512802124398348e-05,
"loss": 0.3037,
"step": 530
},
{
"epoch": 0.4398766700924974,
"grad_norm": 1.0475354194641113,
"learning_rate": 1.9498739194995885e-05,
"loss": 0.257,
"step": 535
},
{
"epoch": 0.44398766700924974,
"grad_norm": 1.3572496175765991,
"learning_rate": 1.9484481394780225e-05,
"loss": 0.2658,
"step": 540
},
{
"epoch": 0.44809866392600206,
"grad_norm": 1.3119100332260132,
"learning_rate": 1.9470029016256417e-05,
"loss": 0.3076,
"step": 545
},
{
"epoch": 0.4522096608427544,
"grad_norm": 1.0053207874298096,
"learning_rate": 1.945538235592135e-05,
"loss": 0.2648,
"step": 550
},
{
"epoch": 0.4563206577595067,
"grad_norm": 1.1991180181503296,
"learning_rate": 1.944054171425772e-05,
"loss": 0.2933,
"step": 555
},
{
"epoch": 0.460431654676259,
"grad_norm": 0.9475321173667908,
"learning_rate": 1.942550739572781e-05,
"loss": 0.3323,
"step": 560
},
{
"epoch": 0.4645426515930113,
"grad_norm": 1.4173755645751953,
"learning_rate": 1.9410279708767282e-05,
"loss": 0.2799,
"step": 565
},
{
"epoch": 0.4686536485097636,
"grad_norm": 1.1307871341705322,
"learning_rate": 1.939485896577883e-05,
"loss": 0.2934,
"step": 570
},
{
"epoch": 0.4727646454265159,
"grad_norm": 1.2060418128967285,
"learning_rate": 1.9379245483125783e-05,
"loss": 0.2336,
"step": 575
},
{
"epoch": 0.47687564234326824,
"grad_norm": 1.452938199043274,
"learning_rate": 1.9363439581125603e-05,
"loss": 0.2929,
"step": 580
},
{
"epoch": 0.48098663926002055,
"grad_norm": 1.2634321451187134,
"learning_rate": 1.9347441584043325e-05,
"loss": 0.2966,
"step": 585
},
{
"epoch": 0.4850976361767729,
"grad_norm": 1.2690166234970093,
"learning_rate": 1.9331251820084897e-05,
"loss": 0.2765,
"step": 590
},
{
"epoch": 0.4892086330935252,
"grad_norm": 1.231868028640747,
"learning_rate": 1.931487062139045e-05,
"loss": 0.2978,
"step": 595
},
{
"epoch": 0.4933196300102775,
"grad_norm": 1.2768539190292358,
"learning_rate": 1.9298298324027476e-05,
"loss": 0.2712,
"step": 600
},
{
"epoch": 0.49743062692702983,
"grad_norm": 1.0039498805999756,
"learning_rate": 1.928153526798395e-05,
"loss": 0.2542,
"step": 605
},
{
"epoch": 0.5015416238437821,
"grad_norm": 0.9825944900512695,
"learning_rate": 1.9264581797161345e-05,
"loss": 0.2919,
"step": 610
},
{
"epoch": 0.5056526207605344,
"grad_norm": 1.2589720487594604,
"learning_rate": 1.9247438259367562e-05,
"loss": 0.2647,
"step": 615
},
{
"epoch": 0.5097636176772867,
"grad_norm": 0.9228754639625549,
"learning_rate": 1.923010500630983e-05,
"loss": 0.314,
"step": 620
},
{
"epoch": 0.513874614594039,
"grad_norm": 1.0735732316970825,
"learning_rate": 1.921258239358746e-05,
"loss": 0.2404,
"step": 625
},
{
"epoch": 0.5179856115107914,
"grad_norm": 1.3011178970336914,
"learning_rate": 1.919487078068455e-05,
"loss": 0.2818,
"step": 630
},
{
"epoch": 0.5220966084275437,
"grad_norm": 1.1417852640151978,
"learning_rate": 1.9176970530962644e-05,
"loss": 0.2779,
"step": 635
},
{
"epoch": 0.526207605344296,
"grad_norm": 1.5373178720474243,
"learning_rate": 1.9158882011653233e-05,
"loss": 0.2582,
"step": 640
},
{
"epoch": 0.5303186022610483,
"grad_norm": 1.3577611446380615,
"learning_rate": 1.914060559385025e-05,
"loss": 0.3093,
"step": 645
},
{
"epoch": 0.5344295991778006,
"grad_norm": 1.2170337438583374,
"learning_rate": 1.9122141652502442e-05,
"loss": 0.255,
"step": 650
},
{
"epoch": 0.538540596094553,
"grad_norm": 1.4195996522903442,
"learning_rate": 1.9103490566405694e-05,
"loss": 0.2874,
"step": 655
},
{
"epoch": 0.5426515930113053,
"grad_norm": 1.1771873235702515,
"learning_rate": 1.9084652718195237e-05,
"loss": 0.2817,
"step": 660
},
{
"epoch": 0.5467625899280576,
"grad_norm": 1.158410668373108,
"learning_rate": 1.906562849433782e-05,
"loss": 0.2421,
"step": 665
},
{
"epoch": 0.5508735868448099,
"grad_norm": 1.0603723526000977,
"learning_rate": 1.9046418285123755e-05,
"loss": 0.2557,
"step": 670
},
{
"epoch": 0.5549845837615622,
"grad_norm": 1.2592222690582275,
"learning_rate": 1.9027022484658947e-05,
"loss": 0.2406,
"step": 675
},
{
"epoch": 0.5590955806783144,
"grad_norm": 0.9237696528434753,
"learning_rate": 1.9007441490856764e-05,
"loss": 0.2735,
"step": 680
},
{
"epoch": 0.5632065775950668,
"grad_norm": 1.1776955127716064,
"learning_rate": 1.8987675705429916e-05,
"loss": 0.2444,
"step": 685
},
{
"epoch": 0.5673175745118191,
"grad_norm": 1.037622332572937,
"learning_rate": 1.896772553388218e-05,
"loss": 0.2928,
"step": 690
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.2918436527252197,
"learning_rate": 1.8947591385500104e-05,
"loss": 0.2912,
"step": 695
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.9499860405921936,
"learning_rate": 1.89272736733446e-05,
"loss": 0.2149,
"step": 700
},
{
"epoch": 0.579650565262076,
"grad_norm": 0.8249284029006958,
"learning_rate": 1.8906772814242472e-05,
"loss": 0.2664,
"step": 705
},
{
"epoch": 0.5837615621788284,
"grad_norm": 0.9467989802360535,
"learning_rate": 1.8886089228777858e-05,
"loss": 0.2239,
"step": 710
},
{
"epoch": 0.5878725590955807,
"grad_norm": 0.9601849317550659,
"learning_rate": 1.8865223341283618e-05,
"loss": 0.2638,
"step": 715
},
{
"epoch": 0.591983556012333,
"grad_norm": 1.234498381614685,
"learning_rate": 1.8844175579832613e-05,
"loss": 0.2486,
"step": 720
},
{
"epoch": 0.5960945529290853,
"grad_norm": 1.1814972162246704,
"learning_rate": 1.8822946376228926e-05,
"loss": 0.3067,
"step": 725
},
{
"epoch": 0.6002055498458376,
"grad_norm": 1.0824075937271118,
"learning_rate": 1.8801536165999008e-05,
"loss": 0.2663,
"step": 730
},
{
"epoch": 0.60431654676259,
"grad_norm": 1.0348601341247559,
"learning_rate": 1.8779945388382742e-05,
"loss": 0.2127,
"step": 735
},
{
"epoch": 0.6084275436793423,
"grad_norm": 1.0171630382537842,
"learning_rate": 1.875817448632443e-05,
"loss": 0.2737,
"step": 740
},
{
"epoch": 0.6125385405960946,
"grad_norm": 0.9645372033119202,
"learning_rate": 1.8736223906463698e-05,
"loss": 0.1949,
"step": 745
},
{
"epoch": 0.6166495375128469,
"grad_norm": 1.3011705875396729,
"learning_rate": 1.8714094099126353e-05,
"loss": 0.2427,
"step": 750
},
{
"epoch": 0.6207605344295992,
"grad_norm": 1.6664115190505981,
"learning_rate": 1.8691785518315132e-05,
"loss": 0.2432,
"step": 755
},
{
"epoch": 0.6248715313463515,
"grad_norm": 1.0895479917526245,
"learning_rate": 1.866929862170038e-05,
"loss": 0.2512,
"step": 760
},
{
"epoch": 0.6289825282631039,
"grad_norm": 1.159765362739563,
"learning_rate": 1.864663387061067e-05,
"loss": 0.2373,
"step": 765
},
{
"epoch": 0.6330935251798561,
"grad_norm": 1.1278481483459473,
"learning_rate": 1.8623791730023347e-05,
"loss": 0.2669,
"step": 770
},
{
"epoch": 0.6372045220966084,
"grad_norm": 1.1040396690368652,
"learning_rate": 1.860077266855497e-05,
"loss": 0.2374,
"step": 775
},
{
"epoch": 0.6413155190133607,
"grad_norm": 1.3087905645370483,
"learning_rate": 1.8577577158451713e-05,
"loss": 0.2313,
"step": 780
},
{
"epoch": 0.645426515930113,
"grad_norm": 1.0294170379638672,
"learning_rate": 1.8554205675579665e-05,
"loss": 0.2876,
"step": 785
},
{
"epoch": 0.6495375128468653,
"grad_norm": 1.6003506183624268,
"learning_rate": 1.8530658699415088e-05,
"loss": 0.2334,
"step": 790
},
{
"epoch": 0.6536485097636177,
"grad_norm": 1.433346152305603,
"learning_rate": 1.850693671303455e-05,
"loss": 0.2283,
"step": 795
},
{
"epoch": 0.65775950668037,
"grad_norm": 1.0867021083831787,
"learning_rate": 1.8483040203105038e-05,
"loss": 0.2255,
"step": 800
},
{
"epoch": 0.6618705035971223,
"grad_norm": 1.2296451330184937,
"learning_rate": 1.8458969659873966e-05,
"loss": 0.2523,
"step": 805
},
{
"epoch": 0.6659815005138746,
"grad_norm": 1.2868030071258545,
"learning_rate": 1.843472557715912e-05,
"loss": 0.2133,
"step": 810
},
{
"epoch": 0.6700924974306269,
"grad_norm": 1.3351691961288452,
"learning_rate": 1.841030845233852e-05,
"loss": 0.205,
"step": 815
},
{
"epoch": 0.6742034943473793,
"grad_norm": 1.3833565711975098,
"learning_rate": 1.8385718786340216e-05,
"loss": 0.2426,
"step": 820
},
{
"epoch": 0.6783144912641316,
"grad_norm": 0.7566655278205872,
"learning_rate": 1.8360957083632037e-05,
"loss": 0.2464,
"step": 825
},
{
"epoch": 0.6824254881808839,
"grad_norm": 1.2757395505905151,
"learning_rate": 1.8336023852211197e-05,
"loss": 0.2916,
"step": 830
},
{
"epoch": 0.6865364850976362,
"grad_norm": 1.3110612630844116,
"learning_rate": 1.831091960359391e-05,
"loss": 0.2279,
"step": 835
},
{
"epoch": 0.6906474820143885,
"grad_norm": 1.1281332969665527,
"learning_rate": 1.828564485280488e-05,
"loss": 0.2405,
"step": 840
},
{
"epoch": 0.6947584789311408,
"grad_norm": 0.971794068813324,
"learning_rate": 1.826020011836674e-05,
"loss": 0.2175,
"step": 845
},
{
"epoch": 0.6988694758478932,
"grad_norm": 1.0543270111083984,
"learning_rate": 1.8234585922289408e-05,
"loss": 0.1993,
"step": 850
},
{
"epoch": 0.7029804727646455,
"grad_norm": 0.9650211334228516,
"learning_rate": 1.8208802790059383e-05,
"loss": 0.2252,
"step": 855
},
{
"epoch": 0.7070914696813977,
"grad_norm": 1.2143207788467407,
"learning_rate": 1.818285125062897e-05,
"loss": 0.2717,
"step": 860
},
{
"epoch": 0.71120246659815,
"grad_norm": 1.3001195192337036,
"learning_rate": 1.815673183640541e-05,
"loss": 0.2537,
"step": 865
},
{
"epoch": 0.7153134635149023,
"grad_norm": 1.3198888301849365,
"learning_rate": 1.8130445083239982e-05,
"loss": 0.247,
"step": 870
},
{
"epoch": 0.7194244604316546,
"grad_norm": 1.1122859716415405,
"learning_rate": 1.8103991530416992e-05,
"loss": 0.2735,
"step": 875
},
{
"epoch": 0.723535457348407,
"grad_norm": 0.8387332558631897,
"learning_rate": 1.807737172064271e-05,
"loss": 0.2444,
"step": 880
},
{
"epoch": 0.7276464542651593,
"grad_norm": 0.7013024687767029,
"learning_rate": 1.805058620003424e-05,
"loss": 0.2819,
"step": 885
},
{
"epoch": 0.7317574511819116,
"grad_norm": 0.9751718640327454,
"learning_rate": 1.802363551810833e-05,
"loss": 0.222,
"step": 890
},
{
"epoch": 0.7358684480986639,
"grad_norm": 1.452439785003662,
"learning_rate": 1.7996520227770067e-05,
"loss": 0.2552,
"step": 895
},
{
"epoch": 0.7399794450154162,
"grad_norm": 1.1648454666137695,
"learning_rate": 1.7969240885301564e-05,
"loss": 0.2902,
"step": 900
},
{
"epoch": 0.7440904419321686,
"grad_norm": 0.932353138923645,
"learning_rate": 1.7941798050350535e-05,
"loss": 0.2491,
"step": 905
},
{
"epoch": 0.7482014388489209,
"grad_norm": 1.0840809345245361,
"learning_rate": 1.7914192285918807e-05,
"loss": 0.2195,
"step": 910
},
{
"epoch": 0.7523124357656732,
"grad_norm": 1.059841513633728,
"learning_rate": 1.7886424158350784e-05,
"loss": 0.2218,
"step": 915
},
{
"epoch": 0.7564234326824255,
"grad_norm": 1.077287197113037,
"learning_rate": 1.785849423732182e-05,
"loss": 0.2158,
"step": 920
},
{
"epoch": 0.7605344295991778,
"grad_norm": 1.2055879831314087,
"learning_rate": 1.7830403095826527e-05,
"loss": 0.2193,
"step": 925
},
{
"epoch": 0.7646454265159301,
"grad_norm": 1.289574384689331,
"learning_rate": 1.7802151310167033e-05,
"loss": 0.2343,
"step": 930
},
{
"epoch": 0.7687564234326825,
"grad_norm": 1.0531070232391357,
"learning_rate": 1.777373945994115e-05,
"loss": 0.2344,
"step": 935
},
{
"epoch": 0.7728674203494348,
"grad_norm": 1.2793794870376587,
"learning_rate": 1.7745168128030483e-05,
"loss": 0.2019,
"step": 940
},
{
"epoch": 0.7769784172661871,
"grad_norm": 1.2498109340667725,
"learning_rate": 1.7716437900588475e-05,
"loss": 0.2167,
"step": 945
},
{
"epoch": 0.7810894141829393,
"grad_norm": 1.266842007637024,
"learning_rate": 1.7687549367028382e-05,
"loss": 0.2029,
"step": 950
},
{
"epoch": 0.7852004110996916,
"grad_norm": 1.0297633409500122,
"learning_rate": 1.7658503120011177e-05,
"loss": 0.2142,
"step": 955
},
{
"epoch": 0.789311408016444,
"grad_norm": 1.0781053304672241,
"learning_rate": 1.7629299755433396e-05,
"loss": 0.2336,
"step": 960
},
{
"epoch": 0.7934224049331963,
"grad_norm": 1.2344133853912354,
"learning_rate": 1.759993987241491e-05,
"loss": 0.3145,
"step": 965
},
{
"epoch": 0.7975334018499486,
"grad_norm": 1.3422881364822388,
"learning_rate": 1.7570424073286635e-05,
"loss": 0.2252,
"step": 970
},
{
"epoch": 0.8016443987667009,
"grad_norm": 1.1779459714889526,
"learning_rate": 1.7540752963578174e-05,
"loss": 0.2348,
"step": 975
},
{
"epoch": 0.8057553956834532,
"grad_norm": 0.9927631616592407,
"learning_rate": 1.7510927152005394e-05,
"loss": 0.2733,
"step": 980
},
{
"epoch": 0.8098663926002055,
"grad_norm": 1.211342215538025,
"learning_rate": 1.748094725045794e-05,
"loss": 0.2836,
"step": 985
},
{
"epoch": 0.8139773895169579,
"grad_norm": 1.2209789752960205,
"learning_rate": 1.7450813873986692e-05,
"loss": 0.2094,
"step": 990
},
{
"epoch": 0.8180883864337102,
"grad_norm": 1.2045336961746216,
"learning_rate": 1.7420527640791108e-05,
"loss": 0.2166,
"step": 995
},
{
"epoch": 0.8221993833504625,
"grad_norm": 1.2108007669448853,
"learning_rate": 1.7390089172206594e-05,
"loss": 0.2397,
"step": 1000
},
{
"epoch": 0.8263103802672148,
"grad_norm": 1.415350079536438,
"learning_rate": 1.735949909269172e-05,
"loss": 0.2734,
"step": 1005
},
{
"epoch": 0.8304213771839671,
"grad_norm": 1.2366716861724854,
"learning_rate": 1.7328758029815425e-05,
"loss": 0.2218,
"step": 1010
},
{
"epoch": 0.8345323741007195,
"grad_norm": 1.1275813579559326,
"learning_rate": 1.7297866614244142e-05,
"loss": 0.1967,
"step": 1015
},
{
"epoch": 0.8386433710174718,
"grad_norm": 1.3222373723983765,
"learning_rate": 1.7266825479728843e-05,
"loss": 0.2229,
"step": 1020
},
{
"epoch": 0.8427543679342241,
"grad_norm": 1.0726741552352905,
"learning_rate": 1.7235635263092066e-05,
"loss": 0.2334,
"step": 1025
},
{
"epoch": 0.8468653648509764,
"grad_norm": 1.1619436740875244,
"learning_rate": 1.7204296604214818e-05,
"loss": 0.2326,
"step": 1030
},
{
"epoch": 0.8509763617677287,
"grad_norm": 0.8942896723747253,
"learning_rate": 1.7172810146023476e-05,
"loss": 0.2356,
"step": 1035
},
{
"epoch": 0.8550873586844809,
"grad_norm": 1.0873944759368896,
"learning_rate": 1.7141176534476586e-05,
"loss": 0.219,
"step": 1040
},
{
"epoch": 0.8591983556012333,
"grad_norm": 1.1331894397735596,
"learning_rate": 1.71093964185516e-05,
"loss": 0.2287,
"step": 1045
},
{
"epoch": 0.8633093525179856,
"grad_norm": 1.2221019268035889,
"learning_rate": 1.7077470450231573e-05,
"loss": 0.2129,
"step": 1050
},
{
"epoch": 0.8674203494347379,
"grad_norm": 1.3943604230880737,
"learning_rate": 1.7045399284491796e-05,
"loss": 0.1951,
"step": 1055
},
{
"epoch": 0.8715313463514902,
"grad_norm": 1.3927587270736694,
"learning_rate": 1.701318357928634e-05,
"loss": 0.2304,
"step": 1060
},
{
"epoch": 0.8756423432682425,
"grad_norm": 1.363983392715454,
"learning_rate": 1.698082399553457e-05,
"loss": 0.1764,
"step": 1065
},
{
"epoch": 0.8797533401849948,
"grad_norm": 0.9369764924049377,
"learning_rate": 1.694832119710758e-05,
"loss": 0.2145,
"step": 1070
},
{
"epoch": 0.8838643371017472,
"grad_norm": 1.4691601991653442,
"learning_rate": 1.691567585081458e-05,
"loss": 0.2607,
"step": 1075
},
{
"epoch": 0.8879753340184995,
"grad_norm": 1.1497533321380615,
"learning_rate": 1.6882888626389214e-05,
"loss": 0.1688,
"step": 1080
},
{
"epoch": 0.8920863309352518,
"grad_norm": 1.4440010786056519,
"learning_rate": 1.6849960196475808e-05,
"loss": 0.232,
"step": 1085
},
{
"epoch": 0.8961973278520041,
"grad_norm": 1.0219671726226807,
"learning_rate": 1.6816891236615588e-05,
"loss": 0.2776,
"step": 1090
},
{
"epoch": 0.9003083247687564,
"grad_norm": 1.03767728805542,
"learning_rate": 1.678368242523282e-05,
"loss": 0.212,
"step": 1095
},
{
"epoch": 0.9044193216855088,
"grad_norm": 1.0109082460403442,
"learning_rate": 1.675033444362087e-05,
"loss": 0.2467,
"step": 1100
},
{
"epoch": 0.9085303186022611,
"grad_norm": 1.099452257156372,
"learning_rate": 1.6716847975928256e-05,
"loss": 0.223,
"step": 1105
},
{
"epoch": 0.9126413155190134,
"grad_norm": 1.117277979850769,
"learning_rate": 1.668322370914459e-05,
"loss": 0.2235,
"step": 1110
},
{
"epoch": 0.9167523124357657,
"grad_norm": 1.1694763898849487,
"learning_rate": 1.66494623330865e-05,
"loss": 0.2628,
"step": 1115
},
{
"epoch": 0.920863309352518,
"grad_norm": 1.2822755575180054,
"learning_rate": 1.6615564540383465e-05,
"loss": 0.172,
"step": 1120
},
{
"epoch": 0.9249743062692704,
"grad_norm": 1.2870562076568604,
"learning_rate": 1.658153102646362e-05,
"loss": 0.2177,
"step": 1125
},
{
"epoch": 0.9290853031860226,
"grad_norm": 1.1713252067565918,
"learning_rate": 1.6547362489539473e-05,
"loss": 0.2219,
"step": 1130
},
{
"epoch": 0.9331963001027749,
"grad_norm": 0.9934049248695374,
"learning_rate": 1.651305963059358e-05,
"loss": 0.2257,
"step": 1135
},
{
"epoch": 0.9373072970195272,
"grad_norm": 1.4836572408676147,
"learning_rate": 1.6478623153364197e-05,
"loss": 0.2625,
"step": 1140
},
{
"epoch": 0.9414182939362795,
"grad_norm": 1.4694515466690063,
"learning_rate": 1.6444053764330794e-05,
"loss": 0.3092,
"step": 1145
},
{
"epoch": 0.9455292908530318,
"grad_norm": 0.8094273209571838,
"learning_rate": 1.64093521726996e-05,
"loss": 0.1948,
"step": 1150
},
{
"epoch": 0.9496402877697842,
"grad_norm": 1.1239687204360962,
"learning_rate": 1.637451909038903e-05,
"loss": 0.3174,
"step": 1155
},
{
"epoch": 0.9537512846865365,
"grad_norm": 1.182337999343872,
"learning_rate": 1.6339555232015093e-05,
"loss": 0.223,
"step": 1160
},
{
"epoch": 0.9578622816032888,
"grad_norm": 1.086424469947815,
"learning_rate": 1.6304461314876722e-05,
"loss": 0.2526,
"step": 1165
},
{
"epoch": 0.9619732785200411,
"grad_norm": 1.3039159774780273,
"learning_rate": 1.626923805894107e-05,
"loss": 0.2073,
"step": 1170
},
{
"epoch": 0.9660842754367934,
"grad_norm": 1.206239938735962,
"learning_rate": 1.6233886186828718e-05,
"loss": 0.2323,
"step": 1175
},
{
"epoch": 0.9701952723535457,
"grad_norm": 1.1538772583007812,
"learning_rate": 1.619840642379888e-05,
"loss": 0.2259,
"step": 1180
},
{
"epoch": 0.9743062692702981,
"grad_norm": 0.9316990375518799,
"learning_rate": 1.6162799497734508e-05,
"loss": 0.2018,
"step": 1185
},
{
"epoch": 0.9784172661870504,
"grad_norm": 1.386884093284607,
"learning_rate": 1.612706613912735e-05,
"loss": 0.2461,
"step": 1190
},
{
"epoch": 0.9825282631038027,
"grad_norm": 1.1741526126861572,
"learning_rate": 1.6091207081062973e-05,
"loss": 0.2356,
"step": 1195
},
{
"epoch": 0.986639260020555,
"grad_norm": 1.0439505577087402,
"learning_rate": 1.605522305920573e-05,
"loss": 0.1756,
"step": 1200
},
{
"epoch": 0.9907502569373073,
"grad_norm": 1.1836326122283936,
"learning_rate": 1.6019114811783663e-05,
"loss": 0.2192,
"step": 1205
},
{
"epoch": 0.9948612538540597,
"grad_norm": 1.37388014793396,
"learning_rate": 1.5982883079573354e-05,
"loss": 0.2178,
"step": 1210
},
{
"epoch": 0.998972250770812,
"grad_norm": 0.9809684157371521,
"learning_rate": 1.5946528605884717e-05,
"loss": 0.2096,
"step": 1215
},
{
"epoch": 1.0024665981500513,
"grad_norm": 1.4830330610275269,
"learning_rate": 1.5910052136545788e-05,
"loss": 0.1877,
"step": 1220
},
{
"epoch": 1.0065775950668037,
"grad_norm": 1.681783676147461,
"learning_rate": 1.5873454419887365e-05,
"loss": 0.2309,
"step": 1225
},
{
"epoch": 1.010688591983556,
"grad_norm": 1.1318143606185913,
"learning_rate": 1.5836736206727717e-05,
"loss": 0.2246,
"step": 1230
},
{
"epoch": 1.0147995889003083,
"grad_norm": 1.2023489475250244,
"learning_rate": 1.5799898250357134e-05,
"loss": 0.2398,
"step": 1235
},
{
"epoch": 1.0189105858170606,
"grad_norm": 1.1647961139678955,
"learning_rate": 1.5762941306522504e-05,
"loss": 0.2077,
"step": 1240
},
{
"epoch": 1.023021582733813,
"grad_norm": 1.2750186920166016,
"learning_rate": 1.5725866133411777e-05,
"loss": 0.2246,
"step": 1245
},
{
"epoch": 1.0271325796505653,
"grad_norm": 1.1721622943878174,
"learning_rate": 1.5688673491638452e-05,
"loss": 0.2273,
"step": 1250
},
{
"epoch": 1.0312435765673176,
"grad_norm": 0.9394551515579224,
"learning_rate": 1.565136414422592e-05,
"loss": 0.2112,
"step": 1255
},
{
"epoch": 1.03535457348407,
"grad_norm": 1.4058277606964111,
"learning_rate": 1.5613938856591867e-05,
"loss": 0.2574,
"step": 1260
},
{
"epoch": 1.0394655704008222,
"grad_norm": 1.388596534729004,
"learning_rate": 1.5576398396532538e-05,
"loss": 0.1818,
"step": 1265
},
{
"epoch": 1.0435765673175745,
"grad_norm": 1.1128935813903809,
"learning_rate": 1.5538743534206968e-05,
"loss": 0.1823,
"step": 1270
},
{
"epoch": 1.0476875642343269,
"grad_norm": 1.1479747295379639,
"learning_rate": 1.550097504212124e-05,
"loss": 0.2382,
"step": 1275
},
{
"epoch": 1.0517985611510792,
"grad_norm": 1.4205436706542969,
"learning_rate": 1.5463093695112572e-05,
"loss": 0.2548,
"step": 1280
},
{
"epoch": 1.0559095580678315,
"grad_norm": 0.9275251030921936,
"learning_rate": 1.542510027033347e-05,
"loss": 0.1469,
"step": 1285
},
{
"epoch": 1.0600205549845838,
"grad_norm": 1.2624469995498657,
"learning_rate": 1.5386995547235756e-05,
"loss": 0.2276,
"step": 1290
},
{
"epoch": 1.0641315519013361,
"grad_norm": 1.2979212999343872,
"learning_rate": 1.534878030755458e-05,
"loss": 0.1892,
"step": 1295
},
{
"epoch": 1.0682425488180884,
"grad_norm": 1.4504139423370361,
"learning_rate": 1.5310455335292404e-05,
"loss": 0.2041,
"step": 1300
},
{
"epoch": 1.0723535457348408,
"grad_norm": 1.2005679607391357,
"learning_rate": 1.527202141670289e-05,
"loss": 0.2174,
"step": 1305
},
{
"epoch": 1.076464542651593,
"grad_norm": 1.295580506324768,
"learning_rate": 1.5233479340274783e-05,
"loss": 0.1981,
"step": 1310
},
{
"epoch": 1.0805755395683454,
"grad_norm": 1.29360830783844,
"learning_rate": 1.5194829896715741e-05,
"loss": 0.2448,
"step": 1315
},
{
"epoch": 1.0846865364850977,
"grad_norm": 0.9235324263572693,
"learning_rate": 1.51560738789361e-05,
"loss": 0.2284,
"step": 1320
},
{
"epoch": 1.08879753340185,
"grad_norm": 1.3092626333236694,
"learning_rate": 1.5117212082032611e-05,
"loss": 0.2107,
"step": 1325
},
{
"epoch": 1.0929085303186024,
"grad_norm": 1.1513203382492065,
"learning_rate": 1.5078245303272133e-05,
"loss": 0.1805,
"step": 1330
},
{
"epoch": 1.0970195272353547,
"grad_norm": 1.2433505058288574,
"learning_rate": 1.5039174342075278e-05,
"loss": 0.2306,
"step": 1335
},
{
"epoch": 1.101130524152107,
"grad_norm": 1.0607112646102905,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.2046,
"step": 1340
},
{
"epoch": 1.105241521068859,
"grad_norm": 1.30000638961792,
"learning_rate": 1.4960723080725164e-05,
"loss": 0.2182,
"step": 1345
},
{
"epoch": 1.1093525179856114,
"grad_norm": 1.1230967044830322,
"learning_rate": 1.492134439003404e-05,
"loss": 0.1732,
"step": 1350
},
{
"epoch": 1.1134635149023637,
"grad_norm": 1.5389747619628906,
"learning_rate": 1.4881864735797798e-05,
"loss": 0.232,
"step": 1355
},
{
"epoch": 1.117574511819116,
"grad_norm": 1.6007003784179688,
"learning_rate": 1.4842284927958908e-05,
"loss": 0.2694,
"step": 1360
},
{
"epoch": 1.1216855087358684,
"grad_norm": 1.1094415187835693,
"learning_rate": 1.4802605778514541e-05,
"loss": 0.1816,
"step": 1365
},
{
"epoch": 1.1257965056526207,
"grad_norm": 1.5207327604293823,
"learning_rate": 1.4762828101499902e-05,
"loss": 0.2355,
"step": 1370
},
{
"epoch": 1.129907502569373,
"grad_norm": 1.3849849700927734,
"learning_rate": 1.4722952712971535e-05,
"loss": 0.2039,
"step": 1375
},
{
"epoch": 1.1340184994861253,
"grad_norm": 1.5623657703399658,
"learning_rate": 1.4682980430990577e-05,
"loss": 0.2081,
"step": 1380
},
{
"epoch": 1.1381294964028776,
"grad_norm": 1.3388752937316895,
"learning_rate": 1.4642912075605976e-05,
"loss": 0.2469,
"step": 1385
},
{
"epoch": 1.14224049331963,
"grad_norm": 1.5565121173858643,
"learning_rate": 1.4602748468837669e-05,
"loss": 0.2172,
"step": 1390
},
{
"epoch": 1.1463514902363823,
"grad_norm": 1.3776781558990479,
"learning_rate": 1.456249043465972e-05,
"loss": 0.1848,
"step": 1395
},
{
"epoch": 1.1504624871531346,
"grad_norm": 1.578230857849121,
"learning_rate": 1.4522138798983408e-05,
"loss": 0.1939,
"step": 1400
},
{
"epoch": 1.154573484069887,
"grad_norm": 1.3362317085266113,
"learning_rate": 1.4481694389640291e-05,
"loss": 0.1807,
"step": 1405
},
{
"epoch": 1.1586844809866392,
"grad_norm": 1.7067232131958008,
"learning_rate": 1.444115803636522e-05,
"loss": 0.2204,
"step": 1410
},
{
"epoch": 1.1627954779033916,
"grad_norm": 1.0936253070831299,
"learning_rate": 1.440053057077931e-05,
"loss": 0.1973,
"step": 1415
},
{
"epoch": 1.1669064748201439,
"grad_norm": 1.5256032943725586,
"learning_rate": 1.4359812826372894e-05,
"loss": 0.2216,
"step": 1420
},
{
"epoch": 1.1710174717368962,
"grad_norm": 1.19940185546875,
"learning_rate": 1.4319005638488413e-05,
"loss": 0.2439,
"step": 1425
},
{
"epoch": 1.1751284686536485,
"grad_norm": 0.9025769233703613,
"learning_rate": 1.4278109844303271e-05,
"loss": 0.2084,
"step": 1430
},
{
"epoch": 1.1792394655704008,
"grad_norm": 1.2088083028793335,
"learning_rate": 1.4237126282812684e-05,
"loss": 0.2359,
"step": 1435
},
{
"epoch": 1.1833504624871531,
"grad_norm": 1.2607866525650024,
"learning_rate": 1.4196055794812437e-05,
"loss": 0.2318,
"step": 1440
},
{
"epoch": 1.1874614594039055,
"grad_norm": 1.3030301332473755,
"learning_rate": 1.4154899222881666e-05,
"loss": 0.1825,
"step": 1445
},
{
"epoch": 1.1915724563206578,
"grad_norm": 1.9711260795593262,
"learning_rate": 1.4113657411365546e-05,
"loss": 0.2345,
"step": 1450
},
{
"epoch": 1.19568345323741,
"grad_norm": 1.4095484018325806,
"learning_rate": 1.4072331206357986e-05,
"loss": 0.2242,
"step": 1455
},
{
"epoch": 1.1997944501541624,
"grad_norm": 1.1082674264907837,
"learning_rate": 1.4030921455684255e-05,
"loss": 0.2083,
"step": 1460
},
{
"epoch": 1.2039054470709147,
"grad_norm": 1.8573055267333984,
"learning_rate": 1.3989429008883613e-05,
"loss": 0.2097,
"step": 1465
},
{
"epoch": 1.208016443987667,
"grad_norm": 1.4647445678710938,
"learning_rate": 1.3947854717191853e-05,
"loss": 0.2139,
"step": 1470
},
{
"epoch": 1.2121274409044194,
"grad_norm": 1.2996141910552979,
"learning_rate": 1.390619943352386e-05,
"loss": 0.244,
"step": 1475
},
{
"epoch": 1.2162384378211717,
"grad_norm": 1.5465834140777588,
"learning_rate": 1.3864464012456103e-05,
"loss": 0.1681,
"step": 1480
},
{
"epoch": 1.220349434737924,
"grad_norm": 1.2666230201721191,
"learning_rate": 1.3822649310209106e-05,
"loss": 0.1859,
"step": 1485
},
{
"epoch": 1.2244604316546763,
"grad_norm": 0.884738028049469,
"learning_rate": 1.378075618462988e-05,
"loss": 0.2195,
"step": 1490
},
{
"epoch": 1.2285714285714286,
"grad_norm": 1.7185105085372925,
"learning_rate": 1.3738785495174325e-05,
"loss": 0.2417,
"step": 1495
},
{
"epoch": 1.232682425488181,
"grad_norm": 1.2726584672927856,
"learning_rate": 1.36967381028896e-05,
"loss": 0.1806,
"step": 1500
},
{
"epoch": 1.2367934224049333,
"grad_norm": 1.3861750364303589,
"learning_rate": 1.3654614870396455e-05,
"loss": 0.1957,
"step": 1505
},
{
"epoch": 1.2409044193216856,
"grad_norm": 1.6232922077178955,
"learning_rate": 1.3612416661871532e-05,
"loss": 0.2038,
"step": 1510
},
{
"epoch": 1.245015416238438,
"grad_norm": 1.4664605855941772,
"learning_rate": 1.3570144343029644e-05,
"loss": 0.2569,
"step": 1515
},
{
"epoch": 1.24912641315519,
"grad_norm": 1.2884601354599,
"learning_rate": 1.352779878110601e-05,
"loss": 0.1872,
"step": 1520
},
{
"epoch": 1.2532374100719426,
"grad_norm": 1.3926196098327637,
"learning_rate": 1.3485380844838461e-05,
"loss": 0.2134,
"step": 1525
},
{
"epoch": 1.2573484069886947,
"grad_norm": 1.4619618654251099,
"learning_rate": 1.3442891404449615e-05,
"loss": 0.2232,
"step": 1530
},
{
"epoch": 1.2614594039054472,
"grad_norm": 1.2975983619689941,
"learning_rate": 1.3400331331629038e-05,
"loss": 0.1845,
"step": 1535
},
{
"epoch": 1.2655704008221993,
"grad_norm": 1.7390352487564087,
"learning_rate": 1.3357701499515345e-05,
"loss": 0.2343,
"step": 1540
},
{
"epoch": 1.2696813977389518,
"grad_norm": 1.1968086957931519,
"learning_rate": 1.3315002782678299e-05,
"loss": 0.2185,
"step": 1545
},
{
"epoch": 1.273792394655704,
"grad_norm": 0.9323089718818665,
"learning_rate": 1.3272236057100849e-05,
"loss": 0.1602,
"step": 1550
},
{
"epoch": 1.2779033915724562,
"grad_norm": 1.4192270040512085,
"learning_rate": 1.3229402200161197e-05,
"loss": 0.2131,
"step": 1555
},
{
"epoch": 1.2820143884892086,
"grad_norm": 1.3882615566253662,
"learning_rate": 1.3186502090614752e-05,
"loss": 0.1807,
"step": 1560
},
{
"epoch": 1.2861253854059609,
"grad_norm": 1.6137700080871582,
"learning_rate": 1.3143536608576141e-05,
"loss": 0.1716,
"step": 1565
},
{
"epoch": 1.2902363823227132,
"grad_norm": 1.1685277223587036,
"learning_rate": 1.310050663550112e-05,
"loss": 0.1609,
"step": 1570
},
{
"epoch": 1.2943473792394655,
"grad_norm": 1.3866482973098755,
"learning_rate": 1.3057413054168525e-05,
"loss": 0.1936,
"step": 1575
},
{
"epoch": 1.2984583761562178,
"grad_norm": 1.498495101928711,
"learning_rate": 1.3014256748662127e-05,
"loss": 0.1979,
"step": 1580
},
{
"epoch": 1.3025693730729702,
"grad_norm": 1.0271029472351074,
"learning_rate": 1.2971038604352521e-05,
"loss": 0.1659,
"step": 1585
},
{
"epoch": 1.3066803699897225,
"grad_norm": 1.0579352378845215,
"learning_rate": 1.2927759507878954e-05,
"loss": 0.1897,
"step": 1590
},
{
"epoch": 1.3107913669064748,
"grad_norm": 1.6076267957687378,
"learning_rate": 1.2884420347131123e-05,
"loss": 0.1919,
"step": 1595
},
{
"epoch": 1.3149023638232271,
"grad_norm": 1.143676519393921,
"learning_rate": 1.284102201123098e-05,
"loss": 0.1931,
"step": 1600
},
{
"epoch": 1.3190133607399794,
"grad_norm": 1.077234148979187,
"learning_rate": 1.2797565390514478e-05,
"loss": 0.2089,
"step": 1605
},
{
"epoch": 1.3231243576567318,
"grad_norm": 1.0523713827133179,
"learning_rate": 1.2754051376513304e-05,
"loss": 0.1682,
"step": 1610
},
{
"epoch": 1.327235354573484,
"grad_norm": 1.4638235569000244,
"learning_rate": 1.27104808619366e-05,
"loss": 0.1963,
"step": 1615
},
{
"epoch": 1.3313463514902364,
"grad_norm": 1.4082380533218384,
"learning_rate": 1.2666854740652641e-05,
"loss": 0.2477,
"step": 1620
},
{
"epoch": 1.3354573484069887,
"grad_norm": 0.8287607431411743,
"learning_rate": 1.2623173907670494e-05,
"loss": 0.2151,
"step": 1625
},
{
"epoch": 1.339568345323741,
"grad_norm": 1.617641568183899,
"learning_rate": 1.2579439259121665e-05,
"loss": 0.2654,
"step": 1630
},
{
"epoch": 1.3436793422404933,
"grad_norm": 0.7832649350166321,
"learning_rate": 1.253565169224171e-05,
"loss": 0.1956,
"step": 1635
},
{
"epoch": 1.3477903391572457,
"grad_norm": 1.3806982040405273,
"learning_rate": 1.2491812105351824e-05,
"loss": 0.2043,
"step": 1640
},
{
"epoch": 1.351901336073998,
"grad_norm": 1.2232375144958496,
"learning_rate": 1.2447921397840417e-05,
"loss": 0.1707,
"step": 1645
},
{
"epoch": 1.3560123329907503,
"grad_norm": 1.5429998636245728,
"learning_rate": 1.240398047014466e-05,
"loss": 0.1683,
"step": 1650
},
{
"epoch": 1.3601233299075026,
"grad_norm": 1.6056349277496338,
"learning_rate": 1.2359990223732023e-05,
"loss": 0.239,
"step": 1655
},
{
"epoch": 1.364234326824255,
"grad_norm": 1.1654701232910156,
"learning_rate": 1.2315951561081754e-05,
"loss": 0.2667,
"step": 1660
},
{
"epoch": 1.3683453237410073,
"grad_norm": 2.054358959197998,
"learning_rate": 1.2271865385666394e-05,
"loss": 0.2238,
"step": 1665
},
{
"epoch": 1.3724563206577596,
"grad_norm": 1.4064335823059082,
"learning_rate": 1.222773260193323e-05,
"loss": 0.211,
"step": 1670
},
{
"epoch": 1.376567317574512,
"grad_norm": 1.4539635181427002,
"learning_rate": 1.2183554115285726e-05,
"loss": 0.1958,
"step": 1675
},
{
"epoch": 1.3806783144912642,
"grad_norm": 1.3629333972930908,
"learning_rate": 1.2139330832064975e-05,
"loss": 0.2158,
"step": 1680
},
{
"epoch": 1.3847893114080163,
"grad_norm": 0.8606613874435425,
"learning_rate": 1.2095063659531087e-05,
"loss": 0.1822,
"step": 1685
},
{
"epoch": 1.3889003083247689,
"grad_norm": 1.137616753578186,
"learning_rate": 1.2050753505844581e-05,
"loss": 0.2088,
"step": 1690
},
{
"epoch": 1.393011305241521,
"grad_norm": 1.4008034467697144,
"learning_rate": 1.2006401280047753e-05,
"loss": 0.1917,
"step": 1695
},
{
"epoch": 1.3971223021582735,
"grad_norm": 1.5580987930297852,
"learning_rate": 1.1962007892046017e-05,
"loss": 0.2043,
"step": 1700
},
{
"epoch": 1.4012332990750256,
"grad_norm": 1.2816071510314941,
"learning_rate": 1.191757425258927e-05,
"loss": 0.2047,
"step": 1705
},
{
"epoch": 1.4053442959917781,
"grad_norm": 1.567262053489685,
"learning_rate": 1.1873101273253167e-05,
"loss": 0.1961,
"step": 1710
},
{
"epoch": 1.4094552929085302,
"grad_norm": 1.2683942317962646,
"learning_rate": 1.1828589866420441e-05,
"loss": 0.1941,
"step": 1715
},
{
"epoch": 1.4135662898252828,
"grad_norm": 1.3658758401870728,
"learning_rate": 1.1784040945262185e-05,
"loss": 0.2007,
"step": 1720
},
{
"epoch": 1.4176772867420349,
"grad_norm": 1.0757827758789062,
"learning_rate": 1.173945542371912e-05,
"loss": 0.181,
"step": 1725
},
{
"epoch": 1.4217882836587872,
"grad_norm": 1.5418345928192139,
"learning_rate": 1.1694834216482827e-05,
"loss": 0.2132,
"step": 1730
},
{
"epoch": 1.4258992805755395,
"grad_norm": 1.3307230472564697,
"learning_rate": 1.1650178238977004e-05,
"loss": 0.1859,
"step": 1735
},
{
"epoch": 1.4300102774922918,
"grad_norm": 1.0295861959457397,
"learning_rate": 1.1605488407338674e-05,
"loss": 0.2017,
"step": 1740
},
{
"epoch": 1.4341212744090441,
"grad_norm": 0.7575131058692932,
"learning_rate": 1.1560765638399398e-05,
"loss": 0.2239,
"step": 1745
},
{
"epoch": 1.4382322713257965,
"grad_norm": 1.2658188343048096,
"learning_rate": 1.1516010849666446e-05,
"loss": 0.1836,
"step": 1750
},
{
"epoch": 1.4423432682425488,
"grad_norm": 1.6164782047271729,
"learning_rate": 1.1471224959304003e-05,
"loss": 0.1773,
"step": 1755
},
{
"epoch": 1.446454265159301,
"grad_norm": 1.5609745979309082,
"learning_rate": 1.1426408886114309e-05,
"loss": 0.1787,
"step": 1760
},
{
"epoch": 1.4505652620760534,
"grad_norm": 1.4867188930511475,
"learning_rate": 1.1381563549518823e-05,
"loss": 0.1734,
"step": 1765
},
{
"epoch": 1.4546762589928057,
"grad_norm": 1.335847020149231,
"learning_rate": 1.1336689869539352e-05,
"loss": 0.1931,
"step": 1770
},
{
"epoch": 1.458787255909558,
"grad_norm": 1.2672595977783203,
"learning_rate": 1.1291788766779179e-05,
"loss": 0.2077,
"step": 1775
},
{
"epoch": 1.4628982528263104,
"grad_norm": 1.1603102684020996,
"learning_rate": 1.1246861162404184e-05,
"loss": 0.2657,
"step": 1780
},
{
"epoch": 1.4670092497430627,
"grad_norm": 0.9221446514129639,
"learning_rate": 1.1201907978123933e-05,
"loss": 0.1829,
"step": 1785
},
{
"epoch": 1.471120246659815,
"grad_norm": 1.1459940671920776,
"learning_rate": 1.1156930136172776e-05,
"loss": 0.2014,
"step": 1790
},
{
"epoch": 1.4752312435765673,
"grad_norm": 1.452026605606079,
"learning_rate": 1.1111928559290928e-05,
"loss": 0.2048,
"step": 1795
},
{
"epoch": 1.4793422404933196,
"grad_norm": 1.1732393503189087,
"learning_rate": 1.1066904170705533e-05,
"loss": 0.21,
"step": 1800
},
{
"epoch": 1.483453237410072,
"grad_norm": 1.3255653381347656,
"learning_rate": 1.1021857894111736e-05,
"loss": 0.2174,
"step": 1805
},
{
"epoch": 1.4875642343268243,
"grad_norm": 1.398779034614563,
"learning_rate": 1.097679065365371e-05,
"loss": 0.2079,
"step": 1810
},
{
"epoch": 1.4916752312435766,
"grad_norm": 1.8272420167922974,
"learning_rate": 1.0931703373905722e-05,
"loss": 0.1772,
"step": 1815
},
{
"epoch": 1.495786228160329,
"grad_norm": 1.989067792892456,
"learning_rate": 1.0886596979853152e-05,
"loss": 0.2067,
"step": 1820
},
{
"epoch": 1.4998972250770812,
"grad_norm": 1.9263696670532227,
"learning_rate": 1.0841472396873516e-05,
"loss": 0.251,
"step": 1825
},
{
"epoch": 1.5040082219938335,
"grad_norm": 1.8333402872085571,
"learning_rate": 1.0796330550717484e-05,
"loss": 0.234,
"step": 1830
},
{
"epoch": 1.5081192189105859,
"grad_norm": 0.866995096206665,
"learning_rate": 1.0751172367489886e-05,
"loss": 0.2539,
"step": 1835
},
{
"epoch": 1.512230215827338,
"grad_norm": 1.1297638416290283,
"learning_rate": 1.0705998773630722e-05,
"loss": 0.2017,
"step": 1840
},
{
"epoch": 1.5163412127440905,
"grad_norm": 1.5568816661834717,
"learning_rate": 1.066081069589614e-05,
"loss": 0.1947,
"step": 1845
},
{
"epoch": 1.5204522096608426,
"grad_norm": 0.9311817288398743,
"learning_rate": 1.0615609061339431e-05,
"loss": 0.2098,
"step": 1850
},
{
"epoch": 1.5245632065775951,
"grad_norm": 1.7587332725524902,
"learning_rate": 1.0570394797292015e-05,
"loss": 0.189,
"step": 1855
},
{
"epoch": 1.5286742034943472,
"grad_norm": 1.7202497720718384,
"learning_rate": 1.0525168831344408e-05,
"loss": 0.1959,
"step": 1860
},
{
"epoch": 1.5327852004110998,
"grad_norm": 1.744030475616455,
"learning_rate": 1.0479932091327198e-05,
"loss": 0.1934,
"step": 1865
},
{
"epoch": 1.5368961973278519,
"grad_norm": 1.5156835317611694,
"learning_rate": 1.0434685505292008e-05,
"loss": 0.1969,
"step": 1870
},
{
"epoch": 1.5410071942446044,
"grad_norm": 1.2871688604354858,
"learning_rate": 1.0389430001492453e-05,
"loss": 0.1926,
"step": 1875
},
{
"epoch": 1.5451181911613565,
"grad_norm": 0.996692419052124,
"learning_rate": 1.0344166508365101e-05,
"loss": 0.1972,
"step": 1880
},
{
"epoch": 1.549229188078109,
"grad_norm": 1.0069717168807983,
"learning_rate": 1.0298895954510426e-05,
"loss": 0.2209,
"step": 1885
},
{
"epoch": 1.5533401849948612,
"grad_norm": 1.5595433712005615,
"learning_rate": 1.025361926867376e-05,
"loss": 0.258,
"step": 1890
},
{
"epoch": 1.5574511819116137,
"grad_norm": 1.4904924631118774,
"learning_rate": 1.0208337379726225e-05,
"loss": 0.2009,
"step": 1895
},
{
"epoch": 1.5615621788283658,
"grad_norm": 1.2531158924102783,
"learning_rate": 1.0163051216645693e-05,
"loss": 0.1595,
"step": 1900
},
{
"epoch": 1.5656731757451183,
"grad_norm": 0.9231387972831726,
"learning_rate": 1.0117761708497727e-05,
"loss": 0.1923,
"step": 1905
},
{
"epoch": 1.5697841726618704,
"grad_norm": 1.3887051343917847,
"learning_rate": 1.0072469784416505e-05,
"loss": 0.2554,
"step": 1910
},
{
"epoch": 1.573895169578623,
"grad_norm": 1.1096982955932617,
"learning_rate": 1.0027176373585774e-05,
"loss": 0.227,
"step": 1915
},
{
"epoch": 1.578006166495375,
"grad_norm": 1.3331483602523804,
"learning_rate": 9.981882405219784e-06,
"loss": 0.2092,
"step": 1920
},
{
"epoch": 1.5821171634121276,
"grad_norm": 1.5959802865982056,
"learning_rate": 9.93658880854422e-06,
"loss": 0.215,
"step": 1925
},
{
"epoch": 1.5862281603288797,
"grad_norm": 1.377021312713623,
"learning_rate": 9.891296512777145e-06,
"loss": 0.1934,
"step": 1930
},
{
"epoch": 1.590339157245632,
"grad_norm": 1.6592588424682617,
"learning_rate": 9.846006447109934e-06,
"loss": 0.1759,
"step": 1935
},
{
"epoch": 1.5944501541623843,
"grad_norm": 1.525452971458435,
"learning_rate": 9.800719540688201e-06,
"loss": 0.1736,
"step": 1940
},
{
"epoch": 1.5985611510791367,
"grad_norm": 1.2749650478363037,
"learning_rate": 9.755436722592757e-06,
"loss": 0.2174,
"step": 1945
},
{
"epoch": 1.602672147995889,
"grad_norm": 1.2123613357543945,
"learning_rate": 9.710158921820535e-06,
"loss": 0.2202,
"step": 1950
},
{
"epoch": 1.6067831449126413,
"grad_norm": 1.1123170852661133,
"learning_rate": 9.664887067265533e-06,
"loss": 0.2006,
"step": 1955
},
{
"epoch": 1.6108941418293936,
"grad_norm": 1.1806175708770752,
"learning_rate": 9.619622087699774e-06,
"loss": 0.2497,
"step": 1960
},
{
"epoch": 1.615005138746146,
"grad_norm": 1.343797206878662,
"learning_rate": 9.574364911754212e-06,
"loss": 0.1791,
"step": 1965
},
{
"epoch": 1.6191161356628982,
"grad_norm": 1.6461807489395142,
"learning_rate": 9.52911646789973e-06,
"loss": 0.1587,
"step": 1970
},
{
"epoch": 1.6232271325796506,
"grad_norm": 1.6421260833740234,
"learning_rate": 9.483877684428059e-06,
"loss": 0.1854,
"step": 1975
},
{
"epoch": 1.6273381294964029,
"grad_norm": 1.1155070066452026,
"learning_rate": 9.438649489432737e-06,
"loss": 0.1602,
"step": 1980
},
{
"epoch": 1.6314491264131552,
"grad_norm": 1.255240559577942,
"learning_rate": 9.393432810790083e-06,
"loss": 0.1772,
"step": 1985
},
{
"epoch": 1.6355601233299075,
"grad_norm": 1.605877161026001,
"learning_rate": 9.348228576140159e-06,
"loss": 0.1797,
"step": 1990
},
{
"epoch": 1.6396711202466598,
"grad_norm": 1.8181346654891968,
"learning_rate": 9.303037712867709e-06,
"loss": 0.2196,
"step": 1995
},
{
"epoch": 1.6437821171634122,
"grad_norm": 2.2337422370910645,
"learning_rate": 9.25786114808319e-06,
"loss": 0.1711,
"step": 2000
},
{
"epoch": 1.6478931140801645,
"grad_norm": 1.1943351030349731,
"learning_rate": 9.212699808603687e-06,
"loss": 0.2107,
"step": 2005
},
{
"epoch": 1.6520041109969168,
"grad_norm": 1.4533287286758423,
"learning_rate": 9.167554620933956e-06,
"loss": 0.2321,
"step": 2010
},
{
"epoch": 1.6561151079136691,
"grad_norm": 1.109503149986267,
"learning_rate": 9.122426511247381e-06,
"loss": 0.2227,
"step": 2015
},
{
"epoch": 1.6602261048304214,
"grad_norm": 1.6231653690338135,
"learning_rate": 9.07731640536698e-06,
"loss": 0.2459,
"step": 2020
},
{
"epoch": 1.6643371017471735,
"grad_norm": 1.4417306184768677,
"learning_rate": 9.032225228746424e-06,
"loss": 0.1839,
"step": 2025
},
{
"epoch": 1.668448098663926,
"grad_norm": 1.4839571714401245,
"learning_rate": 8.98715390645104e-06,
"loss": 0.1573,
"step": 2030
},
{
"epoch": 1.6725590955806782,
"grad_norm": 1.40507972240448,
"learning_rate": 8.942103363138824e-06,
"loss": 0.186,
"step": 2035
},
{
"epoch": 1.6766700924974307,
"grad_norm": 1.316387414932251,
"learning_rate": 8.897074523041499e-06,
"loss": 0.1919,
"step": 2040
},
{
"epoch": 1.6807810894141828,
"grad_norm": 1.687321662902832,
"learning_rate": 8.852068309945519e-06,
"loss": 0.1488,
"step": 2045
},
{
"epoch": 1.6848920863309353,
"grad_norm": 1.2021576166152954,
"learning_rate": 8.807085647173151e-06,
"loss": 0.1845,
"step": 2050
},
{
"epoch": 1.6890030832476874,
"grad_norm": 1.2431976795196533,
"learning_rate": 8.762127457563511e-06,
"loss": 0.1868,
"step": 2055
},
{
"epoch": 1.69311408016444,
"grad_norm": 1.729856252670288,
"learning_rate": 8.717194663453634e-06,
"loss": 0.1878,
"step": 2060
},
{
"epoch": 1.697225077081192,
"grad_norm": 1.2510340213775635,
"learning_rate": 8.672288186659555e-06,
"loss": 0.1861,
"step": 2065
},
{
"epoch": 1.7013360739979446,
"grad_norm": 1.1471279859542847,
"learning_rate": 8.627408948457408e-06,
"loss": 0.2274,
"step": 2070
},
{
"epoch": 1.7054470709146967,
"grad_norm": 1.554674744606018,
"learning_rate": 8.582557869564498e-06,
"loss": 0.1864,
"step": 2075
},
{
"epoch": 1.7095580678314493,
"grad_norm": 1.0342234373092651,
"learning_rate": 8.537735870120447e-06,
"loss": 0.2297,
"step": 2080
},
{
"epoch": 1.7136690647482014,
"grad_norm": 1.476158857345581,
"learning_rate": 8.492943869668289e-06,
"loss": 0.2036,
"step": 2085
},
{
"epoch": 1.717780061664954,
"grad_norm": 1.3690853118896484,
"learning_rate": 8.448182787135614e-06,
"loss": 0.2029,
"step": 2090
},
{
"epoch": 1.721891058581706,
"grad_norm": 1.627550721168518,
"learning_rate": 8.403453540815729e-06,
"loss": 0.2098,
"step": 2095
},
{
"epoch": 1.7260020554984585,
"grad_norm": 1.4499547481536865,
"learning_rate": 8.35875704834879e-06,
"loss": 0.1992,
"step": 2100
},
{
"epoch": 1.7301130524152106,
"grad_norm": 1.2868211269378662,
"learning_rate": 8.314094226703007e-06,
"loss": 0.2087,
"step": 2105
},
{
"epoch": 1.734224049331963,
"grad_norm": 1.9403841495513916,
"learning_rate": 8.26946599215582e-06,
"loss": 0.2119,
"step": 2110
},
{
"epoch": 1.7383350462487153,
"grad_norm": 1.3112574815750122,
"learning_rate": 8.22487326027508e-06,
"loss": 0.1865,
"step": 2115
},
{
"epoch": 1.7424460431654676,
"grad_norm": 1.3602478504180908,
"learning_rate": 8.180316945900309e-06,
"loss": 0.1668,
"step": 2120
},
{
"epoch": 1.74655704008222,
"grad_norm": 1.7516307830810547,
"learning_rate": 8.135797963123894e-06,
"loss": 0.1785,
"step": 2125
},
{
"epoch": 1.7506680369989722,
"grad_norm": 1.3060181140899658,
"learning_rate": 8.091317225272347e-06,
"loss": 0.2075,
"step": 2130
},
{
"epoch": 1.7547790339157245,
"grad_norm": 0.9532691836357117,
"learning_rate": 8.04687564488758e-06,
"loss": 0.2036,
"step": 2135
},
{
"epoch": 1.7588900308324769,
"grad_norm": 1.5024046897888184,
"learning_rate": 8.002474133708163e-06,
"loss": 0.1785,
"step": 2140
},
{
"epoch": 1.7630010277492292,
"grad_norm": 1.2384905815124512,
"learning_rate": 7.958113602650623e-06,
"loss": 0.1807,
"step": 2145
},
{
"epoch": 1.7671120246659815,
"grad_norm": 1.592033863067627,
"learning_rate": 7.913794961790783e-06,
"loss": 0.2129,
"step": 2150
},
{
"epoch": 1.7712230215827338,
"grad_norm": 1.531943678855896,
"learning_rate": 7.869519120345042e-06,
"loss": 0.241,
"step": 2155
},
{
"epoch": 1.7753340184994861,
"grad_norm": 1.1052135229110718,
"learning_rate": 7.825286986651773e-06,
"loss": 0.1997,
"step": 2160
},
{
"epoch": 1.7794450154162385,
"grad_norm": 1.6296882629394531,
"learning_rate": 7.78109946815266e-06,
"loss": 0.1648,
"step": 2165
},
{
"epoch": 1.7835560123329908,
"grad_norm": 1.1843360662460327,
"learning_rate": 7.736957471374075e-06,
"loss": 0.2129,
"step": 2170
},
{
"epoch": 1.787667009249743,
"grad_norm": 1.4156749248504639,
"learning_rate": 7.692861901908506e-06,
"loss": 0.2118,
"step": 2175
},
{
"epoch": 1.7917780061664954,
"grad_norm": 1.6439249515533447,
"learning_rate": 7.64881366439596e-06,
"loss": 0.1809,
"step": 2180
},
{
"epoch": 1.7958890030832477,
"grad_norm": 1.2328163385391235,
"learning_rate": 7.6048136625054e-06,
"loss": 0.2138,
"step": 2185
},
{
"epoch": 1.8,
"grad_norm": 1.3099247217178345,
"learning_rate": 7.560862798916229e-06,
"loss": 0.2181,
"step": 2190
},
{
"epoch": 1.8041109969167524,
"grad_norm": 1.3534038066864014,
"learning_rate": 7.516961975299744e-06,
"loss": 0.1762,
"step": 2195
},
{
"epoch": 1.8082219938335045,
"grad_norm": 1.7935746908187866,
"learning_rate": 7.473112092300654e-06,
"loss": 0.1943,
"step": 2200
},
{
"epoch": 1.812332990750257,
"grad_norm": 1.5042284727096558,
"learning_rate": 7.429314049518601e-06,
"loss": 0.2584,
"step": 2205
},
{
"epoch": 1.816443987667009,
"grad_norm": 1.3112459182739258,
"learning_rate": 7.3855687454896965e-06,
"loss": 0.208,
"step": 2210
},
{
"epoch": 1.8205549845837616,
"grad_norm": 0.9232465624809265,
"learning_rate": 7.341877077668098e-06,
"loss": 0.2193,
"step": 2215
},
{
"epoch": 1.8246659815005137,
"grad_norm": 1.4150248765945435,
"learning_rate": 7.298239942407594e-06,
"loss": 0.1792,
"step": 2220
},
{
"epoch": 1.8287769784172663,
"grad_norm": 1.4757970571517944,
"learning_rate": 7.254658234943206e-06,
"loss": 0.1641,
"step": 2225
},
{
"epoch": 1.8328879753340184,
"grad_norm": 1.5827895402908325,
"learning_rate": 7.211132849372838e-06,
"loss": 0.1959,
"step": 2230
},
{
"epoch": 1.836998972250771,
"grad_norm": 1.4939696788787842,
"learning_rate": 7.1676646786389246e-06,
"loss": 0.1984,
"step": 2235
},
{
"epoch": 1.841109969167523,
"grad_norm": 1.4789783954620361,
"learning_rate": 7.1242546145101066e-06,
"loss": 0.2264,
"step": 2240
},
{
"epoch": 1.8452209660842755,
"grad_norm": 1.1284780502319336,
"learning_rate": 7.080903547562949e-06,
"loss": 0.1928,
"step": 2245
},
{
"epoch": 1.8493319630010276,
"grad_norm": 1.3130507469177246,
"learning_rate": 7.037612367163657e-06,
"loss": 0.1793,
"step": 2250
},
{
"epoch": 1.8534429599177802,
"grad_norm": 1.4979199171066284,
"learning_rate": 6.9943819614498435e-06,
"loss": 0.1967,
"step": 2255
},
{
"epoch": 1.8575539568345323,
"grad_norm": 1.5476986169815063,
"learning_rate": 6.951213217312301e-06,
"loss": 0.2151,
"step": 2260
},
{
"epoch": 1.8616649537512848,
"grad_norm": 1.0267906188964844,
"learning_rate": 6.9081070203768e-06,
"loss": 0.2496,
"step": 2265
},
{
"epoch": 1.865775950668037,
"grad_norm": 1.5171856880187988,
"learning_rate": 6.865064254985938e-06,
"loss": 0.2162,
"step": 2270
},
{
"epoch": 1.8698869475847895,
"grad_norm": 1.5356831550598145,
"learning_rate": 6.822085804180985e-06,
"loss": 0.2015,
"step": 2275
},
{
"epoch": 1.8739979445015416,
"grad_norm": 1.261610984802246,
"learning_rate": 6.779172549683761e-06,
"loss": 0.196,
"step": 2280
},
{
"epoch": 1.878108941418294,
"grad_norm": 1.3029332160949707,
"learning_rate": 6.73632537187856e-06,
"loss": 0.1697,
"step": 2285
},
{
"epoch": 1.8822199383350462,
"grad_norm": 1.676698923110962,
"learning_rate": 6.69354514979409e-06,
"loss": 0.1755,
"step": 2290
},
{
"epoch": 1.8863309352517985,
"grad_norm": 2.004960298538208,
"learning_rate": 6.650832761085417e-06,
"loss": 0.1992,
"step": 2295
},
{
"epoch": 1.8904419321685508,
"grad_norm": 0.8850013017654419,
"learning_rate": 6.608189082015993e-06,
"loss": 0.221,
"step": 2300
},
{
"epoch": 1.8945529290853032,
"grad_norm": 1.1899011135101318,
"learning_rate": 6.565614987439648e-06,
"loss": 0.164,
"step": 2305
},
{
"epoch": 1.8986639260020555,
"grad_norm": 1.388355016708374,
"learning_rate": 6.523111350782664e-06,
"loss": 0.2236,
"step": 2310
},
{
"epoch": 1.9027749229188078,
"grad_norm": 1.3841829299926758,
"learning_rate": 6.480679044025846e-06,
"loss": 0.227,
"step": 2315
},
{
"epoch": 1.90688591983556,
"grad_norm": 1.0201807022094727,
"learning_rate": 6.438318937686631e-06,
"loss": 0.2162,
"step": 2320
},
{
"epoch": 1.9109969167523124,
"grad_norm": 1.0088194608688354,
"learning_rate": 6.396031900801238e-06,
"loss": 0.2136,
"step": 2325
},
{
"epoch": 1.9151079136690647,
"grad_norm": 1.3621116876602173,
"learning_rate": 6.3538188009068306e-06,
"loss": 0.1761,
"step": 2330
},
{
"epoch": 1.919218910585817,
"grad_norm": 1.4654209613800049,
"learning_rate": 6.311680504023718e-06,
"loss": 0.2004,
"step": 2335
},
{
"epoch": 1.9233299075025694,
"grad_norm": 1.19648015499115,
"learning_rate": 6.2696178746376035e-06,
"loss": 0.2066,
"step": 2340
},
{
"epoch": 1.9274409044193217,
"grad_norm": 1.38064706325531,
"learning_rate": 6.227631775681834e-06,
"loss": 0.1615,
"step": 2345
},
{
"epoch": 1.931551901336074,
"grad_norm": 1.5320490598678589,
"learning_rate": 6.1857230685196955e-06,
"loss": 0.1885,
"step": 2350
},
{
"epoch": 1.9356628982528263,
"grad_norm": 1.5025681257247925,
"learning_rate": 6.143892612926755e-06,
"loss": 0.182,
"step": 2355
},
{
"epoch": 1.9397738951695787,
"grad_norm": 1.413241982460022,
"learning_rate": 6.102141267073207e-06,
"loss": 0.2431,
"step": 2360
},
{
"epoch": 1.943884892086331,
"grad_norm": 1.4534951448440552,
"learning_rate": 6.060469887506282e-06,
"loss": 0.2318,
"step": 2365
},
{
"epoch": 1.9479958890030833,
"grad_norm": 1.328620433807373,
"learning_rate": 6.018879329132663e-06,
"loss": 0.2112,
"step": 2370
},
{
"epoch": 1.9521068859198356,
"grad_norm": 1.1238036155700684,
"learning_rate": 5.977370445200949e-06,
"loss": 0.2113,
"step": 2375
},
{
"epoch": 1.956217882836588,
"grad_norm": 1.2546100616455078,
"learning_rate": 5.935944087284155e-06,
"loss": 0.182,
"step": 2380
},
{
"epoch": 1.96032887975334,
"grad_norm": 1.6414473056793213,
"learning_rate": 5.894601105262241e-06,
"loss": 0.2252,
"step": 2385
},
{
"epoch": 1.9644398766700926,
"grad_norm": 1.3691917657852173,
"learning_rate": 5.853342347304665e-06,
"loss": 0.1921,
"step": 2390
},
{
"epoch": 1.9685508735868447,
"grad_norm": 1.4818812608718872,
"learning_rate": 5.812168659852998e-06,
"loss": 0.2008,
"step": 2395
},
{
"epoch": 1.9726618705035972,
"grad_norm": 1.7602829933166504,
"learning_rate": 5.7710808876035604e-06,
"loss": 0.1545,
"step": 2400
},
{
"epoch": 1.9767728674203493,
"grad_norm": 1.7359880208969116,
"learning_rate": 5.73007987349006e-06,
"loss": 0.1716,
"step": 2405
},
{
"epoch": 1.9808838643371018,
"grad_norm": 1.7533327341079712,
"learning_rate": 5.689166458666348e-06,
"loss": 0.1731,
"step": 2410
},
{
"epoch": 1.984994861253854,
"grad_norm": 1.7562685012817383,
"learning_rate": 5.64834148248912e-06,
"loss": 0.1772,
"step": 2415
},
{
"epoch": 1.9891058581706065,
"grad_norm": 1.2940082550048828,
"learning_rate": 5.6076057825007315e-06,
"loss": 0.1907,
"step": 2420
},
{
"epoch": 1.9932168550873586,
"grad_norm": 1.5690494775772095,
"learning_rate": 5.566960194411984e-06,
"loss": 0.2064,
"step": 2425
},
{
"epoch": 1.9973278520041111,
"grad_norm": 0.9809213876724243,
"learning_rate": 5.52640555208499e-06,
"loss": 0.1759,
"step": 2430
},
{
"epoch": 2.0008221993833506,
"grad_norm": 1.2464898824691772,
"learning_rate": 5.485942687516086e-06,
"loss": 0.1983,
"step": 2435
},
{
"epoch": 2.0049331963001027,
"grad_norm": 1.286004900932312,
"learning_rate": 5.445572430818744e-06,
"loss": 0.1692,
"step": 2440
},
{
"epoch": 2.0090441932168552,
"grad_norm": 1.5520018339157104,
"learning_rate": 5.405295610206525e-06,
"loss": 0.1707,
"step": 2445
},
{
"epoch": 2.0131551901336073,
"grad_norm": 1.2041336297988892,
"learning_rate": 5.3651130519761315e-06,
"loss": 0.1989,
"step": 2450
},
{
"epoch": 2.01726618705036,
"grad_norm": 1.3793076276779175,
"learning_rate": 5.3250255804904176e-06,
"loss": 0.2276,
"step": 2455
},
{
"epoch": 2.021377183967112,
"grad_norm": 1.8650401830673218,
"learning_rate": 5.285034018161503e-06,
"loss": 0.2138,
"step": 2460
},
{
"epoch": 2.0254881808838645,
"grad_norm": 1.3267054557800293,
"learning_rate": 5.245139185433875e-06,
"loss": 0.1427,
"step": 2465
},
{
"epoch": 2.0295991778006166,
"grad_norm": 1.0150337219238281,
"learning_rate": 5.205341900767575e-06,
"loss": 0.184,
"step": 2470
},
{
"epoch": 2.033710174717369,
"grad_norm": 1.5393043756484985,
"learning_rate": 5.165642980621413e-06,
"loss": 0.1722,
"step": 2475
},
{
"epoch": 2.0378211716341212,
"grad_norm": 1.7602789402008057,
"learning_rate": 5.1260432394362e-06,
"loss": 0.1736,
"step": 2480
},
{
"epoch": 2.041932168550874,
"grad_norm": 1.5538018941879272,
"learning_rate": 5.0865434896180385e-06,
"loss": 0.1915,
"step": 2485
},
{
"epoch": 2.046043165467626,
"grad_norm": 1.6677844524383545,
"learning_rate": 5.047144541521676e-06,
"loss": 0.2089,
"step": 2490
},
{
"epoch": 2.0501541623843784,
"grad_norm": 1.1350642442703247,
"learning_rate": 5.007847203433869e-06,
"loss": 0.226,
"step": 2495
},
{
"epoch": 2.0542651593011305,
"grad_norm": 1.5596694946289062,
"learning_rate": 4.968652281556794e-06,
"loss": 0.1517,
"step": 2500
},
{
"epoch": 2.058376156217883,
"grad_norm": 1.2307167053222656,
"learning_rate": 4.929560579991513e-06,
"loss": 0.1938,
"step": 2505
},
{
"epoch": 2.062487153134635,
"grad_norm": 1.1274449825286865,
"learning_rate": 4.890572900721479e-06,
"loss": 0.1995,
"step": 2510
},
{
"epoch": 2.0665981500513873,
"grad_norm": 1.5779458284378052,
"learning_rate": 4.851690043596086e-06,
"loss": 0.2166,
"step": 2515
},
{
"epoch": 2.07070914696814,
"grad_norm": 1.2458999156951904,
"learning_rate": 4.81291280631426e-06,
"loss": 0.1655,
"step": 2520
},
{
"epoch": 2.074820143884892,
"grad_norm": 1.380728006362915,
"learning_rate": 4.774241984408068e-06,
"loss": 0.1682,
"step": 2525
},
{
"epoch": 2.0789311408016444,
"grad_norm": 1.5582739114761353,
"learning_rate": 4.7356783712264405e-06,
"loss": 0.1587,
"step": 2530
},
{
"epoch": 2.0830421377183965,
"grad_norm": 1.5341260433197021,
"learning_rate": 4.697222757918872e-06,
"loss": 0.2258,
"step": 2535
},
{
"epoch": 2.087153134635149,
"grad_norm": 1.613232970237732,
"learning_rate": 4.65887593341918e-06,
"loss": 0.2103,
"step": 2540
},
{
"epoch": 2.091264131551901,
"grad_norm": 1.5128930807113647,
"learning_rate": 4.620638684429337e-06,
"loss": 0.2013,
"step": 2545
},
{
"epoch": 2.0953751284686537,
"grad_norm": 1.7848138809204102,
"learning_rate": 4.582511795403334e-06,
"loss": 0.2425,
"step": 2550
},
{
"epoch": 2.099486125385406,
"grad_norm": 1.4135546684265137,
"learning_rate": 4.544496048531062e-06,
"loss": 0.2001,
"step": 2555
},
{
"epoch": 2.1035971223021583,
"grad_norm": 1.6589088439941406,
"learning_rate": 4.506592223722306e-06,
"loss": 0.1777,
"step": 2560
},
{
"epoch": 2.1077081192189104,
"grad_norm": 2.0604190826416016,
"learning_rate": 4.46880109859069e-06,
"loss": 0.2061,
"step": 2565
},
{
"epoch": 2.111819116135663,
"grad_norm": 1.185441493988037,
"learning_rate": 4.431123448437778e-06,
"loss": 0.1852,
"step": 2570
},
{
"epoch": 2.115930113052415,
"grad_norm": 1.49964439868927,
"learning_rate": 4.393560046237143e-06,
"loss": 0.2298,
"step": 2575
},
{
"epoch": 2.1200411099691676,
"grad_norm": 1.7512156963348389,
"learning_rate": 4.3561116626185e-06,
"loss": 0.1768,
"step": 2580
},
{
"epoch": 2.1241521068859197,
"grad_norm": 1.6165354251861572,
"learning_rate": 4.31877906585191e-06,
"loss": 0.1517,
"step": 2585
},
{
"epoch": 2.1282631038026723,
"grad_norm": 1.2202645540237427,
"learning_rate": 4.281563021832027e-06,
"loss": 0.1552,
"step": 2590
},
{
"epoch": 2.1323741007194243,
"grad_norm": 1.716781497001648,
"learning_rate": 4.244464294062358e-06,
"loss": 0.1418,
"step": 2595
},
{
"epoch": 2.136485097636177,
"grad_norm": 1.5364643335342407,
"learning_rate": 4.207483643639629e-06,
"loss": 0.176,
"step": 2600
},
{
"epoch": 2.140596094552929,
"grad_norm": 1.496578574180603,
"learning_rate": 4.170621829238152e-06,
"loss": 0.2021,
"step": 2605
},
{
"epoch": 2.1447070914696815,
"grad_norm": 1.2782753705978394,
"learning_rate": 4.1338796070942576e-06,
"loss": 0.1705,
"step": 2610
},
{
"epoch": 2.1488180883864336,
"grad_norm": 1.102035641670227,
"learning_rate": 4.097257730990806e-06,
"loss": 0.1699,
"step": 2615
},
{
"epoch": 2.152929085303186,
"grad_norm": 1.4746476411819458,
"learning_rate": 4.060756952241691e-06,
"loss": 0.1852,
"step": 2620
},
{
"epoch": 2.1570400822199383,
"grad_norm": 1.3999295234680176,
"learning_rate": 4.024378019676444e-06,
"loss": 0.181,
"step": 2625
},
{
"epoch": 2.161151079136691,
"grad_norm": 1.6118948459625244,
"learning_rate": 3.988121679624874e-06,
"loss": 0.2021,
"step": 2630
},
{
"epoch": 2.165262076053443,
"grad_norm": 1.9151121377944946,
"learning_rate": 3.951988675901744e-06,
"loss": 0.2105,
"step": 2635
},
{
"epoch": 2.1693730729701954,
"grad_norm": 0.8960415124893188,
"learning_rate": 3.915979749791524e-06,
"loss": 0.1696,
"step": 2640
},
{
"epoch": 2.1734840698869475,
"grad_norm": 1.4472572803497314,
"learning_rate": 3.880095640033174e-06,
"loss": 0.1608,
"step": 2645
},
{
"epoch": 2.1775950668037,
"grad_norm": 1.544793725013733,
"learning_rate": 3.844337082804984e-06,
"loss": 0.1833,
"step": 2650
},
{
"epoch": 2.181706063720452,
"grad_norm": 1.5551151037216187,
"learning_rate": 3.8087048117094962e-06,
"loss": 0.1787,
"step": 2655
},
{
"epoch": 2.1858170606372047,
"grad_norm": 1.4573007822036743,
"learning_rate": 3.7731995577584224e-06,
"loss": 0.1301,
"step": 2660
},
{
"epoch": 2.189928057553957,
"grad_norm": 1.362298607826233,
"learning_rate": 3.737822049357662e-06,
"loss": 0.1516,
"step": 2665
},
{
"epoch": 2.1940390544707093,
"grad_norm": 1.576825499534607,
"learning_rate": 3.702573012292373e-06,
"loss": 0.146,
"step": 2670
},
{
"epoch": 2.1981500513874614,
"grad_norm": 1.3643438816070557,
"learning_rate": 3.6674531697120484e-06,
"loss": 0.1407,
"step": 2675
},
{
"epoch": 2.202261048304214,
"grad_norm": 1.7982343435287476,
"learning_rate": 3.6324632421157147e-06,
"loss": 0.1542,
"step": 2680
},
{
"epoch": 2.206372045220966,
"grad_norm": 1.5771673917770386,
"learning_rate": 3.5976039473371273e-06,
"loss": 0.1519,
"step": 2685
},
{
"epoch": 2.210483042137718,
"grad_norm": 1.6472742557525635,
"learning_rate": 3.562876000530048e-06,
"loss": 0.1784,
"step": 2690
},
{
"epoch": 2.2145940390544707,
"grad_norm": 1.4306912422180176,
"learning_rate": 3.5282801141535915e-06,
"loss": 0.1517,
"step": 2695
},
{
"epoch": 2.218705035971223,
"grad_norm": 1.7105224132537842,
"learning_rate": 3.493816997957582e-06,
"loss": 0.1869,
"step": 2700
},
{
"epoch": 2.2228160328879754,
"grad_norm": 1.3035056591033936,
"learning_rate": 3.4594873589680047e-06,
"loss": 0.1611,
"step": 2705
},
{
"epoch": 2.2269270298047275,
"grad_norm": 1.4090354442596436,
"learning_rate": 3.4252919014725137e-06,
"loss": 0.2124,
"step": 2710
},
{
"epoch": 2.23103802672148,
"grad_norm": 1.7435358762741089,
"learning_rate": 3.391231327005955e-06,
"loss": 0.1755,
"step": 2715
},
{
"epoch": 2.235149023638232,
"grad_norm": 1.137243390083313,
"learning_rate": 3.3573063343360048e-06,
"loss": 0.1713,
"step": 2720
},
{
"epoch": 2.2392600205549846,
"grad_norm": 1.4972217082977295,
"learning_rate": 3.3235176194488073e-06,
"loss": 0.2209,
"step": 2725
},
{
"epoch": 2.2433710174717367,
"grad_norm": 1.6285967826843262,
"learning_rate": 3.289865875534709e-06,
"loss": 0.1592,
"step": 2730
},
{
"epoch": 2.2474820143884893,
"grad_norm": 1.5644609928131104,
"learning_rate": 3.2563517929740484e-06,
"loss": 0.1639,
"step": 2735
},
{
"epoch": 2.2515930113052414,
"grad_norm": 1.5566542148590088,
"learning_rate": 3.2229760593229686e-06,
"loss": 0.1865,
"step": 2740
},
{
"epoch": 2.255704008221994,
"grad_norm": 1.6575636863708496,
"learning_rate": 3.1897393592993244e-06,
"loss": 0.1659,
"step": 2745
},
{
"epoch": 2.259815005138746,
"grad_norm": 1.5925614833831787,
"learning_rate": 3.1566423747686402e-06,
"loss": 0.1692,
"step": 2750
},
{
"epoch": 2.2639260020554985,
"grad_norm": 1.0502550601959229,
"learning_rate": 3.123685784730118e-06,
"loss": 0.1978,
"step": 2755
},
{
"epoch": 2.2680369989722506,
"grad_norm": 2.042742967605591,
"learning_rate": 3.090870265302697e-06,
"loss": 0.1601,
"step": 2760
},
{
"epoch": 2.272147995889003,
"grad_norm": 1.3015090227127075,
"learning_rate": 3.058196489711194e-06,
"loss": 0.1819,
"step": 2765
},
{
"epoch": 2.2762589928057553,
"grad_norm": 1.362306833267212,
"learning_rate": 3.0256651282724857e-06,
"loss": 0.2053,
"step": 2770
},
{
"epoch": 2.280369989722508,
"grad_norm": 1.5630079507827759,
"learning_rate": 2.993276848381769e-06,
"loss": 0.1844,
"step": 2775
},
{
"epoch": 2.28448098663926,
"grad_norm": 1.4856841564178467,
"learning_rate": 2.9610323144988505e-06,
"loss": 0.1983,
"step": 2780
},
{
"epoch": 2.2885919835560125,
"grad_norm": 1.6463249921798706,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.1903,
"step": 2785
},
{
"epoch": 2.2927029804727646,
"grad_norm": 1.6934734582901,
"learning_rate": 2.8969771278370105e-06,
"loss": 0.196,
"step": 2790
},
{
"epoch": 2.296813977389517,
"grad_norm": 0.9253600239753723,
"learning_rate": 2.8651677891784267e-06,
"loss": 0.1458,
"step": 2795
},
{
"epoch": 2.300924974306269,
"grad_norm": 1.6479263305664062,
"learning_rate": 2.833504824741349e-06,
"loss": 0.179,
"step": 2800
},
{
"epoch": 2.3050359712230217,
"grad_norm": 1.1921919584274292,
"learning_rate": 2.8019888841054166e-06,
"loss": 0.1749,
"step": 2805
},
{
"epoch": 2.309146968139774,
"grad_norm": 1.347475290298462,
"learning_rate": 2.770620613834023e-06,
"loss": 0.1989,
"step": 2810
},
{
"epoch": 2.3132579650565264,
"grad_norm": 1.2929786443710327,
"learning_rate": 2.73940065746103e-06,
"loss": 0.1993,
"step": 2815
},
{
"epoch": 2.3173689619732785,
"grad_norm": 1.3094338178634644,
"learning_rate": 2.708329655477575e-06,
"loss": 0.2202,
"step": 2820
},
{
"epoch": 2.321479958890031,
"grad_norm": 1.7936643362045288,
"learning_rate": 2.6774082453189296e-06,
"loss": 0.176,
"step": 2825
},
{
"epoch": 2.325590955806783,
"grad_norm": 1.5653270483016968,
"learning_rate": 2.646637061351429e-06,
"loss": 0.1869,
"step": 2830
},
{
"epoch": 2.3297019527235356,
"grad_norm": 1.5138646364212036,
"learning_rate": 2.6160167348594534e-06,
"loss": 0.1846,
"step": 2835
},
{
"epoch": 2.3338129496402877,
"grad_norm": 1.4555552005767822,
"learning_rate": 2.585547894032465e-06,
"loss": 0.1993,
"step": 2840
},
{
"epoch": 2.33792394655704,
"grad_norm": 1.6724660396575928,
"learning_rate": 2.5552311639521376e-06,
"loss": 0.1799,
"step": 2845
},
{
"epoch": 2.3420349434737924,
"grad_norm": 1.3273122310638428,
"learning_rate": 2.525067166579528e-06,
"loss": 0.2047,
"step": 2850
},
{
"epoch": 2.346145940390545,
"grad_norm": 1.859212040901184,
"learning_rate": 2.4950565207423116e-06,
"loss": 0.1683,
"step": 2855
},
{
"epoch": 2.350256937307297,
"grad_norm": 1.3489201068878174,
"learning_rate": 2.4651998421220847e-06,
"loss": 0.1577,
"step": 2860
},
{
"epoch": 2.354367934224049,
"grad_norm": 1.8253015279769897,
"learning_rate": 2.43549774324175e-06,
"loss": 0.1904,
"step": 2865
},
{
"epoch": 2.3584789311408016,
"grad_norm": 1.5466407537460327,
"learning_rate": 2.405950833452928e-06,
"loss": 0.1754,
"step": 2870
},
{
"epoch": 2.362589928057554,
"grad_norm": 1.4899457693099976,
"learning_rate": 2.3765597189234756e-06,
"loss": 0.1667,
"step": 2875
},
{
"epoch": 2.3667009249743063,
"grad_norm": 1.3761615753173828,
"learning_rate": 2.347325002625034e-06,
"loss": 0.184,
"step": 2880
},
{
"epoch": 2.3708119218910584,
"grad_norm": 1.7067718505859375,
"learning_rate": 2.3182472843206647e-06,
"loss": 0.148,
"step": 2885
},
{
"epoch": 2.374922918807811,
"grad_norm": 1.401242971420288,
"learning_rate": 2.289327160552559e-06,
"loss": 0.215,
"step": 2890
},
{
"epoch": 2.379033915724563,
"grad_norm": 1.429301142692566,
"learning_rate": 2.2605652246297737e-06,
"loss": 0.1692,
"step": 2895
},
{
"epoch": 2.3831449126413156,
"grad_norm": 0.8169743418693542,
"learning_rate": 2.2319620666160735e-06,
"loss": 0.184,
"step": 2900
},
{
"epoch": 2.3872559095580677,
"grad_norm": 2.1176815032958984,
"learning_rate": 2.203518273317835e-06,
"loss": 0.2451,
"step": 2905
},
{
"epoch": 2.39136690647482,
"grad_norm": 1.482387661933899,
"learning_rate": 2.175234428271984e-06,
"loss": 0.1903,
"step": 2910
},
{
"epoch": 2.3954779033915723,
"grad_norm": 1.2381435632705688,
"learning_rate": 2.1471111117340505e-06,
"loss": 0.1905,
"step": 2915
},
{
"epoch": 2.399588900308325,
"grad_norm": 1.7667262554168701,
"learning_rate": 2.1191489006662415e-06,
"loss": 0.1952,
"step": 2920
},
{
"epoch": 2.403699897225077,
"grad_norm": 1.508764386177063,
"learning_rate": 2.091348368725614e-06,
"loss": 0.1693,
"step": 2925
},
{
"epoch": 2.4078108941418295,
"grad_norm": 1.4408055543899536,
"learning_rate": 2.0637100862523186e-06,
"loss": 0.1635,
"step": 2930
},
{
"epoch": 2.4119218910585816,
"grad_norm": 1.7050563097000122,
"learning_rate": 2.0362346202578753e-06,
"loss": 0.197,
"step": 2935
},
{
"epoch": 2.416032887975334,
"grad_norm": 1.7732040882110596,
"learning_rate": 2.008922534413551e-06,
"loss": 0.2079,
"step": 2940
},
{
"epoch": 2.420143884892086,
"grad_norm": 1.7007726430892944,
"learning_rate": 1.9817743890388098e-06,
"loss": 0.1562,
"step": 2945
},
{
"epoch": 2.4242548818088387,
"grad_norm": 1.3671354055404663,
"learning_rate": 1.9547907410897902e-06,
"loss": 0.1806,
"step": 2950
},
{
"epoch": 2.428365878725591,
"grad_norm": 1.6518571376800537,
"learning_rate": 1.927972144147905e-06,
"loss": 0.1814,
"step": 2955
},
{
"epoch": 2.4324768756423434,
"grad_norm": 1.8467329740524292,
"learning_rate": 1.901319148408467e-06,
"loss": 0.1511,
"step": 2960
},
{
"epoch": 2.4365878725590955,
"grad_norm": 1.5873874425888062,
"learning_rate": 1.8748323006694058e-06,
"loss": 0.1925,
"step": 2965
},
{
"epoch": 2.440698869475848,
"grad_norm": 1.7341911792755127,
"learning_rate": 1.8485121443200594e-06,
"loss": 0.1746,
"step": 2970
},
{
"epoch": 2.4448098663926,
"grad_norm": 1.740069031715393,
"learning_rate": 1.8223592193300111e-06,
"loss": 0.145,
"step": 2975
},
{
"epoch": 2.4489208633093527,
"grad_norm": 1.0969732999801636,
"learning_rate": 1.7963740622380199e-06,
"loss": 0.1566,
"step": 2980
},
{
"epoch": 2.4530318602261048,
"grad_norm": 1.5586555004119873,
"learning_rate": 1.7705572061410204e-06,
"loss": 0.151,
"step": 2985
},
{
"epoch": 2.4571428571428573,
"grad_norm": 1.4157530069351196,
"learning_rate": 1.7449091806831664e-06,
"loss": 0.2008,
"step": 2990
},
{
"epoch": 2.4612538540596094,
"grad_norm": 1.7738921642303467,
"learning_rate": 1.7194305120449895e-06,
"loss": 0.1865,
"step": 2995
},
{
"epoch": 2.465364850976362,
"grad_norm": 2.144289970397949,
"learning_rate": 1.6941217229325812e-06,
"loss": 0.1891,
"step": 3000
},
{
"epoch": 2.469475847893114,
"grad_norm": 1.4171119928359985,
"learning_rate": 1.6689833325668814e-06,
"loss": 0.1725,
"step": 3005
},
{
"epoch": 2.4735868448098666,
"grad_norm": 1.459351897239685,
"learning_rate": 1.6440158566730314e-06,
"loss": 0.1578,
"step": 3010
},
{
"epoch": 2.4776978417266187,
"grad_norm": 1.8061171770095825,
"learning_rate": 1.619219807469785e-06,
"loss": 0.1696,
"step": 3015
},
{
"epoch": 2.481808838643371,
"grad_norm": 1.492310643196106,
"learning_rate": 1.5945956936589924e-06,
"loss": 0.1902,
"step": 3020
},
{
"epoch": 2.4859198355601233,
"grad_norm": 1.3936516046524048,
"learning_rate": 1.5701440204151864e-06,
"loss": 0.171,
"step": 3025
},
{
"epoch": 2.490030832476876,
"grad_norm": 1.264660120010376,
"learning_rate": 1.5458652893751959e-06,
"loss": 0.1473,
"step": 3030
},
{
"epoch": 2.494141829393628,
"grad_norm": 1.69791841506958,
"learning_rate": 1.521759998627873e-06,
"loss": 0.1703,
"step": 3035
},
{
"epoch": 2.49825282631038,
"grad_norm": 1.560808539390564,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.1656,
"step": 3040
},
{
"epoch": 2.5023638232271326,
"grad_norm": 1.5153892040252686,
"learning_rate": 1.4740717125654492e-06,
"loss": 0.1831,
"step": 3045
},
{
"epoch": 2.506474820143885,
"grad_norm": 1.5965219736099243,
"learning_rate": 1.4504896955965152e-06,
"loss": 0.1464,
"step": 3050
},
{
"epoch": 2.510585817060637,
"grad_norm": 1.9182418584823608,
"learning_rate": 1.4270830755925148e-06,
"loss": 0.1896,
"step": 3055
},
{
"epoch": 2.5146968139773893,
"grad_norm": 1.592840552330017,
"learning_rate": 1.403852332750545e-06,
"loss": 0.1584,
"step": 3060
},
{
"epoch": 2.518807810894142,
"grad_norm": 1.7259905338287354,
"learning_rate": 1.3807979436595187e-06,
"loss": 0.1739,
"step": 3065
},
{
"epoch": 2.5229188078108944,
"grad_norm": 1.8401083946228027,
"learning_rate": 1.357920381290374e-06,
"loss": 0.2003,
"step": 3070
},
{
"epoch": 2.5270298047276465,
"grad_norm": 1.3413900136947632,
"learning_rate": 1.3352201149863631e-06,
"loss": 0.17,
"step": 3075
},
{
"epoch": 2.5311408016443986,
"grad_norm": 1.2386707067489624,
"learning_rate": 1.3126976104534362e-06,
"loss": 0.2034,
"step": 3080
},
{
"epoch": 2.535251798561151,
"grad_norm": 1.4070823192596436,
"learning_rate": 1.2903533297506787e-06,
"loss": 0.1438,
"step": 3085
},
{
"epoch": 2.5393627954779037,
"grad_norm": 1.8814294338226318,
"learning_rate": 1.268187731280842e-06,
"loss": 0.1869,
"step": 3090
},
{
"epoch": 2.5434737923946558,
"grad_norm": 1.6609413623809814,
"learning_rate": 1.2462012697809333e-06,
"loss": 0.1698,
"step": 3095
},
{
"epoch": 2.547584789311408,
"grad_norm": 1.7581918239593506,
"learning_rate": 1.2243943963128735e-06,
"loss": 0.1803,
"step": 3100
},
{
"epoch": 2.5516957862281604,
"grad_norm": 1.710852861404419,
"learning_rate": 1.2027675582542698e-06,
"loss": 0.2221,
"step": 3105
},
{
"epoch": 2.5558067831449125,
"grad_norm": 1.390642523765564,
"learning_rate": 1.1813211992892204e-06,
"loss": 0.1467,
"step": 3110
},
{
"epoch": 2.559917780061665,
"grad_norm": 1.2157833576202393,
"learning_rate": 1.1600557593992135e-06,
"loss": 0.146,
"step": 3115
},
{
"epoch": 2.564028776978417,
"grad_norm": 1.5769902467727661,
"learning_rate": 1.138971674854099e-06,
"loss": 0.1735,
"step": 3120
},
{
"epoch": 2.5681397738951697,
"grad_norm": 1.6799767017364502,
"learning_rate": 1.1180693782031516e-06,
"loss": 0.1735,
"step": 3125
},
{
"epoch": 2.5722507708119218,
"grad_norm": 1.628298282623291,
"learning_rate": 1.0973492982661792e-06,
"loss": 0.1828,
"step": 3130
},
{
"epoch": 2.5763617677286743,
"grad_norm": 1.8699073791503906,
"learning_rate": 1.0768118601247413e-06,
"loss": 0.1723,
"step": 3135
},
{
"epoch": 2.5804727646454264,
"grad_norm": 1.7919323444366455,
"learning_rate": 1.056457485113408e-06,
"loss": 0.1913,
"step": 3140
},
{
"epoch": 2.584583761562179,
"grad_norm": 1.3920973539352417,
"learning_rate": 1.0362865908111418e-06,
"loss": 0.1601,
"step": 3145
},
{
"epoch": 2.588694758478931,
"grad_norm": 1.3566086292266846,
"learning_rate": 1.0162995910327145e-06,
"loss": 0.1667,
"step": 3150
},
{
"epoch": 2.5928057553956836,
"grad_norm": 1.5238653421401978,
"learning_rate": 9.964968958202171e-07,
"loss": 0.1973,
"step": 3155
},
{
"epoch": 2.5969167523124357,
"grad_norm": 1.5982362031936646,
"learning_rate": 9.7687891143465e-07,
"loss": 0.1626,
"step": 3160
},
{
"epoch": 2.6010277492291882,
"grad_norm": 1.6868717670440674,
"learning_rate": 9.574460403475993e-07,
"loss": 0.1749,
"step": 3165
},
{
"epoch": 2.6051387461459403,
"grad_norm": 1.9734680652618408,
"learning_rate": 9.381986812329579e-07,
"loss": 0.199,
"step": 3170
},
{
"epoch": 2.609249743062693,
"grad_norm": 1.6164628267288208,
"learning_rate": 9.19137228958773e-07,
"loss": 0.1394,
"step": 3175
},
{
"epoch": 2.613360739979445,
"grad_norm": 1.8937848806381226,
"learning_rate": 9.002620745791147e-07,
"loss": 0.1667,
"step": 3180
},
{
"epoch": 2.6174717368961975,
"grad_norm": 1.7090622186660767,
"learning_rate": 8.815736053260826e-07,
"loss": 0.2311,
"step": 3185
},
{
"epoch": 2.6215827338129496,
"grad_norm": 1.7733856439590454,
"learning_rate": 8.630722046018458e-07,
"loss": 0.1749,
"step": 3190
},
{
"epoch": 2.6256937307297017,
"grad_norm": 2.024850368499756,
"learning_rate": 8.447582519707786e-07,
"loss": 0.1662,
"step": 3195
},
{
"epoch": 2.6298047276464542,
"grad_norm": 1.9420026540756226,
"learning_rate": 8.266321231516727e-07,
"loss": 0.1871,
"step": 3200
},
{
"epoch": 2.6339157245632068,
"grad_norm": 1.433854341506958,
"learning_rate": 8.086941900100387e-07,
"loss": 0.1751,
"step": 3205
},
{
"epoch": 2.638026721479959,
"grad_norm": 2.0369086265563965,
"learning_rate": 7.909448205504633e-07,
"loss": 0.1296,
"step": 3210
},
{
"epoch": 2.642137718396711,
"grad_norm": 1.7490417957305908,
"learning_rate": 7.733843789090722e-07,
"loss": 0.1592,
"step": 3215
},
{
"epoch": 2.6462487153134635,
"grad_norm": 1.821723222732544,
"learning_rate": 7.560132253460484e-07,
"loss": 0.2148,
"step": 3220
},
{
"epoch": 2.650359712230216,
"grad_norm": 1.665130615234375,
"learning_rate": 7.388317162382475e-07,
"loss": 0.14,
"step": 3225
},
{
"epoch": 2.654470709146968,
"grad_norm": 1.7118257284164429,
"learning_rate": 7.218402040718908e-07,
"loss": 0.1748,
"step": 3230
},
{
"epoch": 2.6585817060637202,
"grad_norm": 2.0985677242279053,
"learning_rate": 7.050390374353244e-07,
"loss": 0.1789,
"step": 3235
},
{
"epoch": 2.662692702980473,
"grad_norm": 1.4616549015045166,
"learning_rate": 6.884285610118702e-07,
"loss": 0.1791,
"step": 3240
},
{
"epoch": 2.6668036998972253,
"grad_norm": 1.662348985671997,
"learning_rate": 6.720091155727626e-07,
"loss": 0.2191,
"step": 3245
},
{
"epoch": 2.6709146968139774,
"grad_norm": 1.784906029701233,
"learning_rate": 6.557810379701446e-07,
"loss": 0.1645,
"step": 3250
},
{
"epoch": 2.6750256937307295,
"grad_norm": 1.4837048053741455,
"learning_rate": 6.397446611301705e-07,
"loss": 0.1963,
"step": 3255
},
{
"epoch": 2.679136690647482,
"grad_norm": 1.8312329053878784,
"learning_rate": 6.239003140461641e-07,
"loss": 0.1987,
"step": 3260
},
{
"epoch": 2.6832476875642346,
"grad_norm": 1.5286824703216553,
"learning_rate": 6.082483217718737e-07,
"loss": 0.1833,
"step": 3265
},
{
"epoch": 2.6873586844809867,
"grad_norm": 1.5359361171722412,
"learning_rate": 5.927890054148111e-07,
"loss": 0.2086,
"step": 3270
},
{
"epoch": 2.691469681397739,
"grad_norm": 1.8544061183929443,
"learning_rate": 5.775226821296487e-07,
"loss": 0.1975,
"step": 3275
},
{
"epoch": 2.6955806783144913,
"grad_norm": 1.8769956827163696,
"learning_rate": 5.624496651117251e-07,
"loss": 0.1692,
"step": 3280
},
{
"epoch": 2.6996916752312434,
"grad_norm": 1.2962095737457275,
"learning_rate": 5.475702635906166e-07,
"loss": 0.1495,
"step": 3285
},
{
"epoch": 2.703802672147996,
"grad_norm": 1.6315231323242188,
"learning_rate": 5.328847828237882e-07,
"loss": 0.1544,
"step": 3290
},
{
"epoch": 2.707913669064748,
"grad_norm": 1.7691444158554077,
"learning_rate": 5.183935240903415e-07,
"loss": 0.1694,
"step": 3295
},
{
"epoch": 2.7120246659815006,
"grad_norm": 1.34203040599823,
"learning_rate": 5.040967846848232e-07,
"loss": 0.1705,
"step": 3300
},
{
"epoch": 2.7161356628982527,
"grad_norm": 1.6373629570007324,
"learning_rate": 4.899948579111291e-07,
"loss": 0.1534,
"step": 3305
},
{
"epoch": 2.7202466598150052,
"grad_norm": 1.2242546081542969,
"learning_rate": 4.760880330764939e-07,
"loss": 0.1516,
"step": 3310
},
{
"epoch": 2.7243576567317573,
"grad_norm": 1.08647620677948,
"learning_rate": 4.6237659548554636e-07,
"loss": 0.1986,
"step": 3315
},
{
"epoch": 2.72846865364851,
"grad_norm": 1.6842622756958008,
"learning_rate": 4.488608264344574e-07,
"loss": 0.2017,
"step": 3320
},
{
"epoch": 2.732579650565262,
"grad_norm": 1.7128268480300903,
"learning_rate": 4.3554100320517767e-07,
"loss": 0.1598,
"step": 3325
},
{
"epoch": 2.7366906474820145,
"grad_norm": 1.817187786102295,
"learning_rate": 4.2241739905974243e-07,
"loss": 0.2204,
"step": 3330
},
{
"epoch": 2.7408016443987666,
"grad_norm": 1.6121926307678223,
"learning_rate": 4.094902832346581e-07,
"loss": 0.1805,
"step": 3335
},
{
"epoch": 2.744912641315519,
"grad_norm": 1.8614606857299805,
"learning_rate": 3.9675992093539674e-07,
"loss": 0.2132,
"step": 3340
},
{
"epoch": 2.7490236382322712,
"grad_norm": 1.5935100317001343,
"learning_rate": 3.8422657333093916e-07,
"loss": 0.1775,
"step": 3345
},
{
"epoch": 2.753134635149024,
"grad_norm": 1.5736486911773682,
"learning_rate": 3.718904975484283e-07,
"loss": 0.1941,
"step": 3350
},
{
"epoch": 2.757245632065776,
"grad_norm": 1.0311506986618042,
"learning_rate": 3.5975194666788224e-07,
"loss": 0.1708,
"step": 3355
},
{
"epoch": 2.7613566289825284,
"grad_norm": 1.4687517881393433,
"learning_rate": 3.478111697170128e-07,
"loss": 0.1694,
"step": 3360
},
{
"epoch": 2.7654676258992805,
"grad_norm": 1.7468496561050415,
"learning_rate": 3.360684116661117e-07,
"loss": 0.2014,
"step": 3365
},
{
"epoch": 2.7695786228160326,
"grad_norm": 1.520976185798645,
"learning_rate": 3.245239134230305e-07,
"loss": 0.1587,
"step": 3370
},
{
"epoch": 2.773689619732785,
"grad_norm": 2.00669527053833,
"learning_rate": 3.131779118282219e-07,
"loss": 0.1857,
"step": 3375
},
{
"epoch": 2.7778006166495377,
"grad_norm": 1.321519374847412,
"learning_rate": 3.020306396499062e-07,
"loss": 0.128,
"step": 3380
},
{
"epoch": 2.78191161356629,
"grad_norm": 1.5371605157852173,
"learning_rate": 2.9108232557927164e-07,
"loss": 0.1696,
"step": 3385
},
{
"epoch": 2.786022610483042,
"grad_norm": 1.4579428434371948,
"learning_rate": 2.803331942258003e-07,
"loss": 0.1653,
"step": 3390
},
{
"epoch": 2.7901336073997944,
"grad_norm": 1.6946520805358887,
"learning_rate": 2.6978346611265083e-07,
"loss": 0.2164,
"step": 3395
},
{
"epoch": 2.794244604316547,
"grad_norm": 1.9009003639221191,
"learning_rate": 2.594333576721331e-07,
"loss": 0.1721,
"step": 3400
},
{
"epoch": 2.798355601233299,
"grad_norm": 1.7347790002822876,
"learning_rate": 2.492830812412783e-07,
"loss": 0.1625,
"step": 3405
},
{
"epoch": 2.802466598150051,
"grad_norm": 1.2449841499328613,
"learning_rate": 2.393328450574728e-07,
"loss": 0.1681,
"step": 3410
},
{
"epoch": 2.8065775950668037,
"grad_norm": 2.043942451477051,
"learning_rate": 2.295828532541855e-07,
"loss": 0.2155,
"step": 3415
},
{
"epoch": 2.8106885919835563,
"grad_norm": 1.5729506015777588,
"learning_rate": 2.200333058567905e-07,
"loss": 0.1848,
"step": 3420
},
{
"epoch": 2.8147995889003083,
"grad_norm": 1.1342136859893799,
"learning_rate": 2.1068439877845237e-07,
"loss": 0.1788,
"step": 3425
},
{
"epoch": 2.8189105858170604,
"grad_norm": 1.41780424118042,
"learning_rate": 2.01536323816115e-07,
"loss": 0.2229,
"step": 3430
},
{
"epoch": 2.823021582733813,
"grad_norm": 1.172594428062439,
"learning_rate": 1.9258926864655692e-07,
"loss": 0.1501,
"step": 3435
},
{
"epoch": 2.8271325796505655,
"grad_norm": 1.7016807794570923,
"learning_rate": 1.8384341682255225e-07,
"loss": 0.1886,
"step": 3440
},
{
"epoch": 2.8312435765673176,
"grad_norm": 1.7683440446853638,
"learning_rate": 1.7529894776909917e-07,
"loss": 0.1846,
"step": 3445
},
{
"epoch": 2.8353545734840697,
"grad_norm": 1.4340758323669434,
"learning_rate": 1.669560367797396e-07,
"loss": 0.1786,
"step": 3450
},
{
"epoch": 2.8394655704008223,
"grad_norm": 1.2617582082748413,
"learning_rate": 1.588148550129609e-07,
"loss": 0.2281,
"step": 3455
},
{
"epoch": 2.8435765673175744,
"grad_norm": 1.28896164894104,
"learning_rate": 1.5087556948868876e-07,
"loss": 0.165,
"step": 3460
},
{
"epoch": 2.847687564234327,
"grad_norm": 0.8633646368980408,
"learning_rate": 1.4313834308486097e-07,
"loss": 0.1673,
"step": 3465
},
{
"epoch": 2.851798561151079,
"grad_norm": 1.3453377485275269,
"learning_rate": 1.3560333453407682e-07,
"loss": 0.1887,
"step": 3470
},
{
"epoch": 2.8559095580678315,
"grad_norm": 1.9705557823181152,
"learning_rate": 1.2827069842035412e-07,
"loss": 0.1759,
"step": 3475
},
{
"epoch": 2.8600205549845836,
"grad_norm": 1.7831838130950928,
"learning_rate": 1.211405851759484e-07,
"loss": 0.1835,
"step": 3480
},
{
"epoch": 2.864131551901336,
"grad_norm": 1.3642257452011108,
"learning_rate": 1.1421314107826764e-07,
"loss": 0.2027,
"step": 3485
},
{
"epoch": 2.8682425488180883,
"grad_norm": 1.5517573356628418,
"learning_rate": 1.0748850824687795e-07,
"loss": 0.1419,
"step": 3490
},
{
"epoch": 2.872353545734841,
"grad_norm": 1.8059098720550537,
"learning_rate": 1.0096682464057706e-07,
"loss": 0.1744,
"step": 3495
},
{
"epoch": 2.876464542651593,
"grad_norm": 1.645180344581604,
"learning_rate": 9.46482240545743e-08,
"loss": 0.1418,
"step": 3500
},
{
"epoch": 2.8805755395683454,
"grad_norm": 1.353611946105957,
"learning_rate": 8.853283611774177e-08,
"loss": 0.1595,
"step": 3505
},
{
"epoch": 2.8846865364850975,
"grad_norm": 1.6561661958694458,
"learning_rate": 8.262078628995085e-08,
"loss": 0.1823,
"step": 3510
},
{
"epoch": 2.88879753340185,
"grad_norm": 1.1401209831237793,
"learning_rate": 7.691219585950538e-08,
"loss": 0.1914,
"step": 3515
},
{
"epoch": 2.892908530318602,
"grad_norm": 1.54417085647583,
"learning_rate": 7.140718194065033e-08,
"loss": 0.2103,
"step": 3520
},
{
"epoch": 2.8970195272353547,
"grad_norm": 1.6267364025115967,
"learning_rate": 6.610585747116705e-08,
"loss": 0.1746,
"step": 3525
},
{
"epoch": 2.901130524152107,
"grad_norm": 1.1939152479171753,
"learning_rate": 6.100833121005956e-08,
"loss": 0.1652,
"step": 3530
},
{
"epoch": 2.9052415210688594,
"grad_norm": 1.812522053718567,
"learning_rate": 5.6114707735320795e-08,
"loss": 0.1894,
"step": 3535
},
{
"epoch": 2.9093525179856115,
"grad_norm": 1.9067127704620361,
"learning_rate": 5.142508744178987e-08,
"loss": 0.1686,
"step": 3540
},
{
"epoch": 2.9134635149023635,
"grad_norm": 1.1483782529830933,
"learning_rate": 4.6939566539089265e-08,
"loss": 0.168,
"step": 3545
},
{
"epoch": 2.917574511819116,
"grad_norm": 2.0629680156707764,
"learning_rate": 4.2658237049655325e-08,
"loss": 0.1886,
"step": 3550
},
{
"epoch": 2.9216855087358686,
"grad_norm": 1.3101327419281006,
"learning_rate": 3.8581186806843086e-08,
"loss": 0.2038,
"step": 3555
},
{
"epoch": 2.9257965056526207,
"grad_norm": 1.8617042303085327,
"learning_rate": 3.470849945313548e-08,
"loss": 0.1801,
"step": 3560
},
{
"epoch": 2.929907502569373,
"grad_norm": 1.6082433462142944,
"learning_rate": 3.104025443841363e-08,
"loss": 0.1972,
"step": 3565
},
{
"epoch": 2.9340184994861254,
"grad_norm": 1.5044277906417847,
"learning_rate": 2.757652701834035e-08,
"loss": 0.1869,
"step": 3570
},
{
"epoch": 2.938129496402878,
"grad_norm": 1.6845835447311401,
"learning_rate": 2.431738825280583e-08,
"loss": 0.1965,
"step": 3575
},
{
"epoch": 2.94224049331963,
"grad_norm": 1.7309244871139526,
"learning_rate": 2.1262905004475477e-08,
"loss": 0.1399,
"step": 3580
},
{
"epoch": 2.946351490236382,
"grad_norm": 1.3211627006530762,
"learning_rate": 1.8413139937418776e-08,
"loss": 0.2233,
"step": 3585
},
{
"epoch": 2.9504624871531346,
"grad_norm": 1.3240909576416016,
"learning_rate": 1.5768151515818118e-08,
"loss": 0.1891,
"step": 3590
},
{
"epoch": 2.954573484069887,
"grad_norm": 1.5092717409133911,
"learning_rate": 1.332799400277418e-08,
"loss": 0.1894,
"step": 3595
},
{
"epoch": 2.9586844809866393,
"grad_norm": 1.5229946374893188,
"learning_rate": 1.1092717459192381e-08,
"loss": 0.1709,
"step": 3600
},
{
"epoch": 2.9627954779033914,
"grad_norm": 1.7171857357025146,
"learning_rate": 9.062367742754819e-09,
"loss": 0.2038,
"step": 3605
},
{
"epoch": 2.966906474820144,
"grad_norm": 1.5181477069854736,
"learning_rate": 7.236986506978793e-09,
"loss": 0.2173,
"step": 3610
},
{
"epoch": 2.9710174717368965,
"grad_norm": 1.5067954063415527,
"learning_rate": 5.616611200364164e-09,
"loss": 0.1744,
"step": 3615
},
{
"epoch": 2.9751284686536486,
"grad_norm": 1.9040520191192627,
"learning_rate": 4.201275065620625e-09,
"loss": 0.1684,
"step": 3620
},
{
"epoch": 2.9792394655704006,
"grad_norm": 1.7340034246444702,
"learning_rate": 2.991007138993807e-09,
"loss": 0.1679,
"step": 3625
},
{
"epoch": 2.983350462487153,
"grad_norm": 1.7730134725570679,
"learning_rate": 1.985832249662423e-09,
"loss": 0.2154,
"step": 3630
},
{
"epoch": 2.9874614594039053,
"grad_norm": 2.1094627380371094,
"learning_rate": 1.185771019230897e-09,
"loss": 0.1583,
"step": 3635
},
{
"epoch": 2.991572456320658,
"grad_norm": 1.5417195558547974,
"learning_rate": 5.908398613074795e-10,
"loss": 0.1622,
"step": 3640
},
{
"epoch": 2.99568345323741,
"grad_norm": 1.7417429685592651,
"learning_rate": 2.0105098116673938e-10,
"loss": 0.1632,
"step": 3645
},
{
"epoch": 2.9997944501541625,
"grad_norm": 1.8266246318817139,
"learning_rate": 1.64123754997636e-11,
"loss": 0.2147,
"step": 3650
},
{
"epoch": 3.0,
"step": 3651,
"total_flos": 1.482904391886766e+18,
"train_loss": 0.2381349169034169,
"train_runtime": 7521.888,
"train_samples_per_second": 1.94,
"train_steps_per_second": 0.485
}
],
"logging_steps": 5,
"max_steps": 3651,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.482904391886766e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}