web_general_LR_1e-5_bs_48_epoch_2 / trainer_state.json
Rubywong123's picture
Upload folder using huggingface_hub
8aff194 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2818,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007097232079489,
"grad_norm": 3.7790934086436856,
"learning_rate": 3.546099290780142e-08,
"loss": 1.9134,
"step": 1
},
{
"epoch": 0.0035486160397444995,
"grad_norm": 3.7538740105859385,
"learning_rate": 1.7730496453900713e-07,
"loss": 1.9866,
"step": 5
},
{
"epoch": 0.007097232079488999,
"grad_norm": 3.8599272316924025,
"learning_rate": 3.5460992907801425e-07,
"loss": 1.9938,
"step": 10
},
{
"epoch": 0.0106458481192335,
"grad_norm": 2.9752159955654793,
"learning_rate": 5.319148936170213e-07,
"loss": 1.8826,
"step": 15
},
{
"epoch": 0.014194464158977998,
"grad_norm": 1.8057605671917776,
"learning_rate": 7.092198581560285e-07,
"loss": 1.6773,
"step": 20
},
{
"epoch": 0.017743080198722498,
"grad_norm": 1.7146277967624175,
"learning_rate": 8.865248226950356e-07,
"loss": 1.4572,
"step": 25
},
{
"epoch": 0.021291696238467,
"grad_norm": 1.7165328516172886,
"learning_rate": 1.0638297872340427e-06,
"loss": 1.1992,
"step": 30
},
{
"epoch": 0.0248403122782115,
"grad_norm": 0.5635195619261276,
"learning_rate": 1.2411347517730497e-06,
"loss": 0.9227,
"step": 35
},
{
"epoch": 0.028388928317955996,
"grad_norm": 0.49646858212588296,
"learning_rate": 1.418439716312057e-06,
"loss": 0.8536,
"step": 40
},
{
"epoch": 0.0319375443577005,
"grad_norm": 0.4154764959523544,
"learning_rate": 1.595744680851064e-06,
"loss": 0.8191,
"step": 45
},
{
"epoch": 0.035486160397444996,
"grad_norm": 0.33168194477789015,
"learning_rate": 1.7730496453900712e-06,
"loss": 0.7748,
"step": 50
},
{
"epoch": 0.03903477643718949,
"grad_norm": 0.29274747606971463,
"learning_rate": 1.9503546099290782e-06,
"loss": 0.7399,
"step": 55
},
{
"epoch": 0.042583392476934,
"grad_norm": 0.24110671556201152,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.7207,
"step": 60
},
{
"epoch": 0.046132008516678494,
"grad_norm": 0.23131165469891526,
"learning_rate": 2.3049645390070924e-06,
"loss": 0.6855,
"step": 65
},
{
"epoch": 0.049680624556423,
"grad_norm": 0.25085769209845776,
"learning_rate": 2.4822695035460995e-06,
"loss": 0.6591,
"step": 70
},
{
"epoch": 0.053229240596167494,
"grad_norm": 0.2451669785806806,
"learning_rate": 2.6595744680851065e-06,
"loss": 0.645,
"step": 75
},
{
"epoch": 0.05677785663591199,
"grad_norm": 0.24390146989920652,
"learning_rate": 2.836879432624114e-06,
"loss": 0.6082,
"step": 80
},
{
"epoch": 0.060326472675656495,
"grad_norm": 0.27358878103492035,
"learning_rate": 3.0141843971631207e-06,
"loss": 0.5811,
"step": 85
},
{
"epoch": 0.063875088715401,
"grad_norm": 0.29467232189308734,
"learning_rate": 3.191489361702128e-06,
"loss": 0.5701,
"step": 90
},
{
"epoch": 0.06742370475514549,
"grad_norm": 0.4181550513151881,
"learning_rate": 3.368794326241135e-06,
"loss": 0.524,
"step": 95
},
{
"epoch": 0.07097232079488999,
"grad_norm": 0.276956239050407,
"learning_rate": 3.5460992907801423e-06,
"loss": 0.4865,
"step": 100
},
{
"epoch": 0.0745209368346345,
"grad_norm": 0.30743981542976856,
"learning_rate": 3.723404255319149e-06,
"loss": 0.5006,
"step": 105
},
{
"epoch": 0.07806955287437899,
"grad_norm": 0.2837124193323978,
"learning_rate": 3.9007092198581565e-06,
"loss": 0.4981,
"step": 110
},
{
"epoch": 0.08161816891412349,
"grad_norm": 0.2786326515630205,
"learning_rate": 4.078014184397163e-06,
"loss": 0.4601,
"step": 115
},
{
"epoch": 0.085166784953868,
"grad_norm": 0.3146144342829529,
"learning_rate": 4.255319148936171e-06,
"loss": 0.4583,
"step": 120
},
{
"epoch": 0.0887154009936125,
"grad_norm": 0.2404177890339331,
"learning_rate": 4.432624113475177e-06,
"loss": 0.4337,
"step": 125
},
{
"epoch": 0.09226401703335699,
"grad_norm": 0.18648319401117486,
"learning_rate": 4.609929078014185e-06,
"loss": 0.4361,
"step": 130
},
{
"epoch": 0.09581263307310149,
"grad_norm": 0.16993920498441797,
"learning_rate": 4.787234042553192e-06,
"loss": 0.411,
"step": 135
},
{
"epoch": 0.099361249112846,
"grad_norm": 0.16754156901325204,
"learning_rate": 4.964539007092199e-06,
"loss": 0.4012,
"step": 140
},
{
"epoch": 0.10290986515259049,
"grad_norm": 0.16281157940182128,
"learning_rate": 5.141843971631206e-06,
"loss": 0.4309,
"step": 145
},
{
"epoch": 0.10645848119233499,
"grad_norm": 0.15295342422669697,
"learning_rate": 5.319148936170213e-06,
"loss": 0.4189,
"step": 150
},
{
"epoch": 0.11000709723207949,
"grad_norm": 0.14548305879678064,
"learning_rate": 5.49645390070922e-06,
"loss": 0.4344,
"step": 155
},
{
"epoch": 0.11355571327182398,
"grad_norm": 0.19182159903871332,
"learning_rate": 5.673758865248228e-06,
"loss": 0.4073,
"step": 160
},
{
"epoch": 0.11710432931156849,
"grad_norm": 0.1762922550852886,
"learning_rate": 5.851063829787235e-06,
"loss": 0.4083,
"step": 165
},
{
"epoch": 0.12065294535131299,
"grad_norm": 0.15177995786475712,
"learning_rate": 6.028368794326241e-06,
"loss": 0.4205,
"step": 170
},
{
"epoch": 0.1242015613910575,
"grad_norm": 0.16458108499284144,
"learning_rate": 6.205673758865248e-06,
"loss": 0.3927,
"step": 175
},
{
"epoch": 0.127750177430802,
"grad_norm": 0.1616320835815001,
"learning_rate": 6.382978723404256e-06,
"loss": 0.4262,
"step": 180
},
{
"epoch": 0.1312987934705465,
"grad_norm": 0.150543173630326,
"learning_rate": 6.560283687943263e-06,
"loss": 0.3905,
"step": 185
},
{
"epoch": 0.13484740951029098,
"grad_norm": 0.15847627568820752,
"learning_rate": 6.73758865248227e-06,
"loss": 0.4382,
"step": 190
},
{
"epoch": 0.1383960255500355,
"grad_norm": 0.13858755795554453,
"learning_rate": 6.914893617021278e-06,
"loss": 0.3888,
"step": 195
},
{
"epoch": 0.14194464158977999,
"grad_norm": 0.15295111638974812,
"learning_rate": 7.092198581560285e-06,
"loss": 0.4093,
"step": 200
},
{
"epoch": 0.14549325762952448,
"grad_norm": 0.16201792028865017,
"learning_rate": 7.269503546099291e-06,
"loss": 0.4052,
"step": 205
},
{
"epoch": 0.149041873669269,
"grad_norm": 0.13193151344566895,
"learning_rate": 7.446808510638298e-06,
"loss": 0.3881,
"step": 210
},
{
"epoch": 0.15259048970901348,
"grad_norm": 0.15580522552816817,
"learning_rate": 7.624113475177306e-06,
"loss": 0.3876,
"step": 215
},
{
"epoch": 0.15613910574875797,
"grad_norm": 0.15756164816411716,
"learning_rate": 7.801418439716313e-06,
"loss": 0.4062,
"step": 220
},
{
"epoch": 0.1596877217885025,
"grad_norm": 0.15037461588661732,
"learning_rate": 7.97872340425532e-06,
"loss": 0.3908,
"step": 225
},
{
"epoch": 0.16323633782824698,
"grad_norm": 0.1576349396611341,
"learning_rate": 8.156028368794326e-06,
"loss": 0.4103,
"step": 230
},
{
"epoch": 0.16678495386799147,
"grad_norm": 0.15304572818824008,
"learning_rate": 8.333333333333334e-06,
"loss": 0.427,
"step": 235
},
{
"epoch": 0.170333569907736,
"grad_norm": 0.1431286779687159,
"learning_rate": 8.510638297872341e-06,
"loss": 0.4414,
"step": 240
},
{
"epoch": 0.17388218594748048,
"grad_norm": 0.16850495455132444,
"learning_rate": 8.687943262411349e-06,
"loss": 0.3809,
"step": 245
},
{
"epoch": 0.177430801987225,
"grad_norm": 0.14343292774726926,
"learning_rate": 8.865248226950355e-06,
"loss": 0.3994,
"step": 250
},
{
"epoch": 0.18097941802696949,
"grad_norm": 0.1497969593973364,
"learning_rate": 9.042553191489362e-06,
"loss": 0.4227,
"step": 255
},
{
"epoch": 0.18452803406671398,
"grad_norm": 0.1314065169237124,
"learning_rate": 9.21985815602837e-06,
"loss": 0.4059,
"step": 260
},
{
"epoch": 0.1880766501064585,
"grad_norm": 0.14223349303898583,
"learning_rate": 9.397163120567377e-06,
"loss": 0.39,
"step": 265
},
{
"epoch": 0.19162526614620298,
"grad_norm": 0.1379045957241283,
"learning_rate": 9.574468085106385e-06,
"loss": 0.3816,
"step": 270
},
{
"epoch": 0.19517388218594747,
"grad_norm": 0.1403705022056377,
"learning_rate": 9.75177304964539e-06,
"loss": 0.3942,
"step": 275
},
{
"epoch": 0.198722498225692,
"grad_norm": 0.13724576150188525,
"learning_rate": 9.929078014184398e-06,
"loss": 0.3845,
"step": 280
},
{
"epoch": 0.20227111426543648,
"grad_norm": 0.13221415114664842,
"learning_rate": 9.999965471058488e-06,
"loss": 0.4264,
"step": 285
},
{
"epoch": 0.20581973030518097,
"grad_norm": 0.1563854622261848,
"learning_rate": 9.999754462587396e-06,
"loss": 0.395,
"step": 290
},
{
"epoch": 0.2093683463449255,
"grad_norm": 0.14463348906300427,
"learning_rate": 9.999351636476109e-06,
"loss": 0.4042,
"step": 295
},
{
"epoch": 0.21291696238466998,
"grad_norm": 0.1307773434720309,
"learning_rate": 9.998757008179218e-06,
"loss": 0.3525,
"step": 300
},
{
"epoch": 0.21646557842441447,
"grad_norm": 0.13423547285339485,
"learning_rate": 9.997970600509882e-06,
"loss": 0.3987,
"step": 305
},
{
"epoch": 0.22001419446415899,
"grad_norm": 0.13050960351673896,
"learning_rate": 9.996992443638958e-06,
"loss": 0.3777,
"step": 310
},
{
"epoch": 0.22356281050390348,
"grad_norm": 0.10959515812941,
"learning_rate": 9.995822575093833e-06,
"loss": 0.3919,
"step": 315
},
{
"epoch": 0.22711142654364797,
"grad_norm": 0.11578468532823626,
"learning_rate": 9.994461039756998e-06,
"loss": 0.3689,
"step": 320
},
{
"epoch": 0.23066004258339248,
"grad_norm": 0.1319679054511137,
"learning_rate": 9.992907889864318e-06,
"loss": 0.3692,
"step": 325
},
{
"epoch": 0.23420865862313697,
"grad_norm": 0.12266049392466183,
"learning_rate": 9.991163185003028e-06,
"loss": 0.3784,
"step": 330
},
{
"epoch": 0.23775727466288146,
"grad_norm": 0.10623404301137673,
"learning_rate": 9.989226992109449e-06,
"loss": 0.3635,
"step": 335
},
{
"epoch": 0.24130589070262598,
"grad_norm": 0.1385466776158659,
"learning_rate": 9.987099385466419e-06,
"loss": 0.3904,
"step": 340
},
{
"epoch": 0.24485450674237047,
"grad_norm": 0.11969768577390712,
"learning_rate": 9.984780446700445e-06,
"loss": 0.356,
"step": 345
},
{
"epoch": 0.248403122782115,
"grad_norm": 0.13182349040173033,
"learning_rate": 9.982270264778565e-06,
"loss": 0.3955,
"step": 350
},
{
"epoch": 0.25195173882185945,
"grad_norm": 0.11893674203895117,
"learning_rate": 9.979568936004943e-06,
"loss": 0.3769,
"step": 355
},
{
"epoch": 0.255500354861604,
"grad_norm": 0.11711134082747417,
"learning_rate": 9.976676564017176e-06,
"loss": 0.3712,
"step": 360
},
{
"epoch": 0.2590489709013485,
"grad_norm": 0.10725474620055833,
"learning_rate": 9.973593259782301e-06,
"loss": 0.3816,
"step": 365
},
{
"epoch": 0.262597586941093,
"grad_norm": 0.10097105511827506,
"learning_rate": 9.970319141592559e-06,
"loss": 0.3822,
"step": 370
},
{
"epoch": 0.26614620298083747,
"grad_norm": 0.12727415953962343,
"learning_rate": 9.966854335060842e-06,
"loss": 0.3886,
"step": 375
},
{
"epoch": 0.26969481902058196,
"grad_norm": 0.10658949668409544,
"learning_rate": 9.963198973115881e-06,
"loss": 0.3442,
"step": 380
},
{
"epoch": 0.27324343506032645,
"grad_norm": 0.10788804271311757,
"learning_rate": 9.959353195997144e-06,
"loss": 0.403,
"step": 385
},
{
"epoch": 0.276792051100071,
"grad_norm": 0.10918138560586861,
"learning_rate": 9.955317151249453e-06,
"loss": 0.3738,
"step": 390
},
{
"epoch": 0.2803406671398155,
"grad_norm": 0.10741665213286025,
"learning_rate": 9.951090993717329e-06,
"loss": 0.3487,
"step": 395
},
{
"epoch": 0.28388928317955997,
"grad_norm": 0.14916065904934225,
"learning_rate": 9.946674885539046e-06,
"loss": 0.3993,
"step": 400
},
{
"epoch": 0.28743789921930446,
"grad_norm": 0.12083845437290831,
"learning_rate": 9.942068996140414e-06,
"loss": 0.3702,
"step": 405
},
{
"epoch": 0.29098651525904895,
"grad_norm": 0.15082333185529684,
"learning_rate": 9.937273502228283e-06,
"loss": 0.4074,
"step": 410
},
{
"epoch": 0.2945351312987935,
"grad_norm": 0.11237617745955839,
"learning_rate": 9.932288587783745e-06,
"loss": 0.3826,
"step": 415
},
{
"epoch": 0.298083747338538,
"grad_norm": 0.11179234467428462,
"learning_rate": 9.927114444055102e-06,
"loss": 0.3701,
"step": 420
},
{
"epoch": 0.3016323633782825,
"grad_norm": 0.11669605008027141,
"learning_rate": 9.921751269550508e-06,
"loss": 0.3919,
"step": 425
},
{
"epoch": 0.30518097941802697,
"grad_norm": 0.11450404830103858,
"learning_rate": 9.916199270030364e-06,
"loss": 0.3742,
"step": 430
},
{
"epoch": 0.30872959545777146,
"grad_norm": 0.1391241125471054,
"learning_rate": 9.910458658499418e-06,
"loss": 0.3529,
"step": 435
},
{
"epoch": 0.31227821149751595,
"grad_norm": 0.13578871498887587,
"learning_rate": 9.904529655198598e-06,
"loss": 0.3619,
"step": 440
},
{
"epoch": 0.3158268275372605,
"grad_norm": 0.1315689143085967,
"learning_rate": 9.89841248759656e-06,
"loss": 0.3776,
"step": 445
},
{
"epoch": 0.319375443577005,
"grad_norm": 0.11500948891475682,
"learning_rate": 9.892107390380959e-06,
"loss": 0.3771,
"step": 450
},
{
"epoch": 0.32292405961674947,
"grad_norm": 0.10981369243455155,
"learning_rate": 9.885614605449444e-06,
"loss": 0.3205,
"step": 455
},
{
"epoch": 0.32647267565649396,
"grad_norm": 0.09615207571856538,
"learning_rate": 9.87893438190039e-06,
"loss": 0.3314,
"step": 460
},
{
"epoch": 0.33002129169623845,
"grad_norm": 0.10500244992949563,
"learning_rate": 9.872066976023323e-06,
"loss": 0.3671,
"step": 465
},
{
"epoch": 0.33356990773598294,
"grad_norm": 0.15546160291286917,
"learning_rate": 9.8650126512891e-06,
"loss": 0.3628,
"step": 470
},
{
"epoch": 0.3371185237757275,
"grad_norm": 0.10760107017085022,
"learning_rate": 9.857771678339796e-06,
"loss": 0.3592,
"step": 475
},
{
"epoch": 0.340667139815472,
"grad_norm": 0.11896816109137531,
"learning_rate": 9.850344334978324e-06,
"loss": 0.3522,
"step": 480
},
{
"epoch": 0.34421575585521647,
"grad_norm": 0.12163674770311829,
"learning_rate": 9.84273090615777e-06,
"loss": 0.3879,
"step": 485
},
{
"epoch": 0.34776437189496096,
"grad_norm": 0.10502541527485883,
"learning_rate": 9.834931683970468e-06,
"loss": 0.3767,
"step": 490
},
{
"epoch": 0.35131298793470545,
"grad_norm": 0.10841614750514875,
"learning_rate": 9.826946967636793e-06,
"loss": 0.3565,
"step": 495
},
{
"epoch": 0.35486160397445,
"grad_norm": 0.11668607605340724,
"learning_rate": 9.818777063493675e-06,
"loss": 0.3389,
"step": 500
},
{
"epoch": 0.3584102200141945,
"grad_norm": 0.10690348721316159,
"learning_rate": 9.810422284982856e-06,
"loss": 0.3624,
"step": 505
},
{
"epoch": 0.36195883605393897,
"grad_norm": 0.11228948356181981,
"learning_rate": 9.801882952638853e-06,
"loss": 0.358,
"step": 510
},
{
"epoch": 0.36550745209368346,
"grad_norm": 0.10163494242107411,
"learning_rate": 9.793159394076672e-06,
"loss": 0.3316,
"step": 515
},
{
"epoch": 0.36905606813342795,
"grad_norm": 0.11632955695131274,
"learning_rate": 9.784251943979232e-06,
"loss": 0.3661,
"step": 520
},
{
"epoch": 0.37260468417317244,
"grad_norm": 0.08976905195822309,
"learning_rate": 9.775160944084527e-06,
"loss": 0.3629,
"step": 525
},
{
"epoch": 0.376153300212917,
"grad_norm": 0.09263950333267311,
"learning_rate": 9.765886743172512e-06,
"loss": 0.3671,
"step": 530
},
{
"epoch": 0.3797019162526615,
"grad_norm": 0.11551198054702863,
"learning_rate": 9.756429697051728e-06,
"loss": 0.3613,
"step": 535
},
{
"epoch": 0.38325053229240597,
"grad_norm": 0.12470125246343837,
"learning_rate": 9.746790168545647e-06,
"loss": 0.3374,
"step": 540
},
{
"epoch": 0.38679914833215046,
"grad_norm": 0.10042530922313107,
"learning_rate": 9.73696852747875e-06,
"loss": 0.3384,
"step": 545
},
{
"epoch": 0.39034776437189495,
"grad_norm": 0.11394862907049574,
"learning_rate": 9.726965150662346e-06,
"loss": 0.3305,
"step": 550
},
{
"epoch": 0.39389638041163944,
"grad_norm": 0.11455790540510169,
"learning_rate": 9.716780421880108e-06,
"loss": 0.3913,
"step": 555
},
{
"epoch": 0.397444996451384,
"grad_norm": 0.10952009279238427,
"learning_rate": 9.706414731873352e-06,
"loss": 0.3512,
"step": 560
},
{
"epoch": 0.40099361249112847,
"grad_norm": 0.12065253965315029,
"learning_rate": 9.695868478326047e-06,
"loss": 0.3577,
"step": 565
},
{
"epoch": 0.40454222853087296,
"grad_norm": 0.11897837393286707,
"learning_rate": 9.685142065849556e-06,
"loss": 0.3547,
"step": 570
},
{
"epoch": 0.40809084457061745,
"grad_norm": 0.1129638044494709,
"learning_rate": 9.674235905967113e-06,
"loss": 0.3859,
"step": 575
},
{
"epoch": 0.41163946061036194,
"grad_norm": 0.09994263942761819,
"learning_rate": 9.663150417098037e-06,
"loss": 0.3185,
"step": 580
},
{
"epoch": 0.4151880766501065,
"grad_norm": 0.11368184205433603,
"learning_rate": 9.651886024541675e-06,
"loss": 0.3541,
"step": 585
},
{
"epoch": 0.418736692689851,
"grad_norm": 0.11147587733602603,
"learning_rate": 9.64044316046109e-06,
"loss": 0.313,
"step": 590
},
{
"epoch": 0.42228530872959547,
"grad_norm": 0.09996967800842779,
"learning_rate": 9.628822263866479e-06,
"loss": 0.3319,
"step": 595
},
{
"epoch": 0.42583392476933996,
"grad_norm": 0.1010520040303903,
"learning_rate": 9.617023780598326e-06,
"loss": 0.3225,
"step": 600
},
{
"epoch": 0.42938254080908445,
"grad_norm": 0.1015501769107125,
"learning_rate": 9.605048163310305e-06,
"loss": 0.3715,
"step": 605
},
{
"epoch": 0.43293115684882894,
"grad_norm": 0.1169861715254158,
"learning_rate": 9.592895871451908e-06,
"loss": 0.3725,
"step": 610
},
{
"epoch": 0.4364797728885735,
"grad_norm": 0.11989686434864336,
"learning_rate": 9.58056737125082e-06,
"loss": 0.3394,
"step": 615
},
{
"epoch": 0.44002838892831797,
"grad_norm": 0.09167015266114685,
"learning_rate": 9.56806313569503e-06,
"loss": 0.3128,
"step": 620
},
{
"epoch": 0.44357700496806246,
"grad_norm": 0.10129575520323858,
"learning_rate": 9.555383644514686e-06,
"loss": 0.3292,
"step": 625
},
{
"epoch": 0.44712562100780695,
"grad_norm": 0.1122229023098103,
"learning_rate": 9.542529384163697e-06,
"loss": 0.3339,
"step": 630
},
{
"epoch": 0.45067423704755144,
"grad_norm": 0.11565836109150361,
"learning_rate": 9.529500847801055e-06,
"loss": 0.3532,
"step": 635
},
{
"epoch": 0.45422285308729593,
"grad_norm": 0.1030400294285783,
"learning_rate": 9.516298535271926e-06,
"loss": 0.3215,
"step": 640
},
{
"epoch": 0.4577714691270405,
"grad_norm": 0.11547528917517318,
"learning_rate": 9.502922953088472e-06,
"loss": 0.353,
"step": 645
},
{
"epoch": 0.46132008516678497,
"grad_norm": 0.10741016689638407,
"learning_rate": 9.489374614410413e-06,
"loss": 0.3476,
"step": 650
},
{
"epoch": 0.46486870120652946,
"grad_norm": 0.12833627735831074,
"learning_rate": 9.475654039025348e-06,
"loss": 0.3464,
"step": 655
},
{
"epoch": 0.46841731724627395,
"grad_norm": 0.11954795379672073,
"learning_rate": 9.461761753328804e-06,
"loss": 0.3023,
"step": 660
},
{
"epoch": 0.47196593328601844,
"grad_norm": 0.11945349774879428,
"learning_rate": 9.447698290304045e-06,
"loss": 0.3516,
"step": 665
},
{
"epoch": 0.4755145493257629,
"grad_norm": 0.0996781692656219,
"learning_rate": 9.433464189501626e-06,
"loss": 0.3335,
"step": 670
},
{
"epoch": 0.47906316536550747,
"grad_norm": 0.1259272910725644,
"learning_rate": 9.419059997018691e-06,
"loss": 0.3369,
"step": 675
},
{
"epoch": 0.48261178140525196,
"grad_norm": 0.09710972204219799,
"learning_rate": 9.40448626547802e-06,
"loss": 0.3129,
"step": 680
},
{
"epoch": 0.48616039744499645,
"grad_norm": 0.12189314593499007,
"learning_rate": 9.389743554006826e-06,
"loss": 0.3531,
"step": 685
},
{
"epoch": 0.48970901348474094,
"grad_norm": 0.10189023981699444,
"learning_rate": 9.37483242821531e-06,
"loss": 0.3406,
"step": 690
},
{
"epoch": 0.49325762952448543,
"grad_norm": 0.10707194416506903,
"learning_rate": 9.359753460174961e-06,
"loss": 0.3525,
"step": 695
},
{
"epoch": 0.49680624556423,
"grad_norm": 0.09999616462098168,
"learning_rate": 9.344507228396599e-06,
"loss": 0.3116,
"step": 700
},
{
"epoch": 0.5003548616039745,
"grad_norm": 0.11514672293485381,
"learning_rate": 9.329094317808189e-06,
"loss": 0.3336,
"step": 705
},
{
"epoch": 0.5039034776437189,
"grad_norm": 0.1025498465472322,
"learning_rate": 9.313515319732397e-06,
"loss": 0.3503,
"step": 710
},
{
"epoch": 0.5074520936834634,
"grad_norm": 0.10666205607230526,
"learning_rate": 9.297770831863906e-06,
"loss": 0.3371,
"step": 715
},
{
"epoch": 0.511000709723208,
"grad_norm": 0.1426918044554978,
"learning_rate": 9.281861458246474e-06,
"loss": 0.3486,
"step": 720
},
{
"epoch": 0.5145493257629524,
"grad_norm": 0.11698375842933832,
"learning_rate": 9.265787809249784e-06,
"loss": 0.3335,
"step": 725
},
{
"epoch": 0.518097941802697,
"grad_norm": 0.11754977354544109,
"learning_rate": 9.249550501545998e-06,
"loss": 0.3286,
"step": 730
},
{
"epoch": 0.5216465578424414,
"grad_norm": 0.10644481322832934,
"learning_rate": 9.233150158086118e-06,
"loss": 0.3134,
"step": 735
},
{
"epoch": 0.525195173882186,
"grad_norm": 0.09533490147947708,
"learning_rate": 9.216587408076078e-06,
"loss": 0.3356,
"step": 740
},
{
"epoch": 0.5287437899219305,
"grad_norm": 0.1066273318171933,
"learning_rate": 9.19986288695261e-06,
"loss": 0.3498,
"step": 745
},
{
"epoch": 0.5322924059616749,
"grad_norm": 0.10332411562375157,
"learning_rate": 9.182977236358856e-06,
"loss": 0.3577,
"step": 750
},
{
"epoch": 0.5358410220014195,
"grad_norm": 0.11021107127349049,
"learning_rate": 9.16593110411976e-06,
"loss": 0.3219,
"step": 755
},
{
"epoch": 0.5393896380411639,
"grad_norm": 0.10440475869396082,
"learning_rate": 9.148725144217208e-06,
"loss": 0.2917,
"step": 760
},
{
"epoch": 0.5429382540809085,
"grad_norm": 0.0976004543223068,
"learning_rate": 9.131360016764945e-06,
"loss": 0.3269,
"step": 765
},
{
"epoch": 0.5464868701206529,
"grad_norm": 0.1094060951712041,
"learning_rate": 9.113836387983239e-06,
"loss": 0.3326,
"step": 770
},
{
"epoch": 0.5500354861603974,
"grad_norm": 0.10014441364612882,
"learning_rate": 9.09615493017333e-06,
"loss": 0.339,
"step": 775
},
{
"epoch": 0.553584102200142,
"grad_norm": 0.10425087388210275,
"learning_rate": 9.078316321691629e-06,
"loss": 0.3303,
"step": 780
},
{
"epoch": 0.5571327182398864,
"grad_norm": 0.10483907396840525,
"learning_rate": 9.060321246923707e-06,
"loss": 0.3327,
"step": 785
},
{
"epoch": 0.560681334279631,
"grad_norm": 0.11066528205798155,
"learning_rate": 9.042170396258019e-06,
"loss": 0.3393,
"step": 790
},
{
"epoch": 0.5642299503193754,
"grad_norm": 0.1033774277527613,
"learning_rate": 9.023864466059432e-06,
"loss": 0.3404,
"step": 795
},
{
"epoch": 0.5677785663591199,
"grad_norm": 0.09997819583349826,
"learning_rate": 9.0054041586425e-06,
"loss": 0.3041,
"step": 800
},
{
"epoch": 0.5713271823988645,
"grad_norm": 0.09876141575238233,
"learning_rate": 8.986790182244525e-06,
"loss": 0.3174,
"step": 805
},
{
"epoch": 0.5748757984386089,
"grad_norm": 0.09858116084186744,
"learning_rate": 8.96802325099838e-06,
"loss": 0.3535,
"step": 810
},
{
"epoch": 0.5784244144783535,
"grad_norm": 0.12290123344340215,
"learning_rate": 8.949104084905119e-06,
"loss": 0.3314,
"step": 815
},
{
"epoch": 0.5819730305180979,
"grad_norm": 0.08404992114362587,
"learning_rate": 8.930033409806342e-06,
"loss": 0.3317,
"step": 820
},
{
"epoch": 0.5855216465578424,
"grad_norm": 0.09744842736144836,
"learning_rate": 8.910811957356357e-06,
"loss": 0.303,
"step": 825
},
{
"epoch": 0.589070262597587,
"grad_norm": 0.10838904893914392,
"learning_rate": 8.89144046499411e-06,
"loss": 0.3287,
"step": 830
},
{
"epoch": 0.5926188786373314,
"grad_norm": 0.08971531911720379,
"learning_rate": 8.871919675914888e-06,
"loss": 0.3176,
"step": 835
},
{
"epoch": 0.596167494677076,
"grad_norm": 0.10931433846590326,
"learning_rate": 8.852250339041806e-06,
"loss": 0.3343,
"step": 840
},
{
"epoch": 0.5997161107168204,
"grad_norm": 0.10474178145117448,
"learning_rate": 8.83243320899708e-06,
"loss": 0.3277,
"step": 845
},
{
"epoch": 0.603264726756565,
"grad_norm": 0.10805986952084197,
"learning_rate": 8.812469046073069e-06,
"loss": 0.3286,
"step": 850
},
{
"epoch": 0.6068133427963094,
"grad_norm": 0.112790321693706,
"learning_rate": 8.792358616203109e-06,
"loss": 0.3413,
"step": 855
},
{
"epoch": 0.6103619588360539,
"grad_norm": 0.1034470142488599,
"learning_rate": 8.772102690932133e-06,
"loss": 0.3309,
"step": 860
},
{
"epoch": 0.6139105748757985,
"grad_norm": 0.11304306005031962,
"learning_rate": 8.751702047387057e-06,
"loss": 0.33,
"step": 865
},
{
"epoch": 0.6174591909155429,
"grad_norm": 0.10896974549399982,
"learning_rate": 8.731157468246979e-06,
"loss": 0.3058,
"step": 870
},
{
"epoch": 0.6210078069552875,
"grad_norm": 0.10779951716283877,
"learning_rate": 8.710469741713141e-06,
"loss": 0.3313,
"step": 875
},
{
"epoch": 0.6245564229950319,
"grad_norm": 0.10136064352534128,
"learning_rate": 8.689639661478699e-06,
"loss": 0.3419,
"step": 880
},
{
"epoch": 0.6281050390347764,
"grad_norm": 0.09887428301669085,
"learning_rate": 8.668668026698263e-06,
"loss": 0.3209,
"step": 885
},
{
"epoch": 0.631653655074521,
"grad_norm": 0.09210292865485042,
"learning_rate": 8.647555641957243e-06,
"loss": 0.2781,
"step": 890
},
{
"epoch": 0.6352022711142654,
"grad_norm": 0.10530372404303365,
"learning_rate": 8.62630331724098e-06,
"loss": 0.3262,
"step": 895
},
{
"epoch": 0.63875088715401,
"grad_norm": 0.09680900541508322,
"learning_rate": 8.604911867903671e-06,
"loss": 0.3533,
"step": 900
},
{
"epoch": 0.6422995031937544,
"grad_norm": 0.10870932081301858,
"learning_rate": 8.58338211463708e-06,
"loss": 0.3484,
"step": 905
},
{
"epoch": 0.6458481192334989,
"grad_norm": 0.10288274105650778,
"learning_rate": 8.561714883439067e-06,
"loss": 0.2943,
"step": 910
},
{
"epoch": 0.6493967352732435,
"grad_norm": 0.10988711427086194,
"learning_rate": 8.539911005581884e-06,
"loss": 0.2842,
"step": 915
},
{
"epoch": 0.6529453513129879,
"grad_norm": 0.1145142305247729,
"learning_rate": 8.517971317580288e-06,
"loss": 0.3256,
"step": 920
},
{
"epoch": 0.6564939673527325,
"grad_norm": 0.13342731996336954,
"learning_rate": 8.495896661159453e-06,
"loss": 0.3568,
"step": 925
},
{
"epoch": 0.6600425833924769,
"grad_norm": 0.09991798770628711,
"learning_rate": 8.473687883222665e-06,
"loss": 0.2963,
"step": 930
},
{
"epoch": 0.6635911994322214,
"grad_norm": 0.11208456473524073,
"learning_rate": 8.451345835818844e-06,
"loss": 0.3133,
"step": 935
},
{
"epoch": 0.6671398154719659,
"grad_norm": 0.12390532253496145,
"learning_rate": 8.428871376109844e-06,
"loss": 0.3009,
"step": 940
},
{
"epoch": 0.6706884315117104,
"grad_norm": 0.13127239548600328,
"learning_rate": 8.40626536633757e-06,
"loss": 0.347,
"step": 945
},
{
"epoch": 0.674237047551455,
"grad_norm": 0.10204128565162945,
"learning_rate": 8.38352867379091e-06,
"loss": 0.3213,
"step": 950
},
{
"epoch": 0.6777856635911994,
"grad_norm": 0.11635498416693124,
"learning_rate": 8.360662170772436e-06,
"loss": 0.2978,
"step": 955
},
{
"epoch": 0.681334279630944,
"grad_norm": 0.09968217785760913,
"learning_rate": 8.337666734564958e-06,
"loss": 0.3214,
"step": 960
},
{
"epoch": 0.6848828956706884,
"grad_norm": 0.1159485044574924,
"learning_rate": 8.314543247397865e-06,
"loss": 0.3217,
"step": 965
},
{
"epoch": 0.6884315117104329,
"grad_norm": 0.11007757179999383,
"learning_rate": 8.291292596413272e-06,
"loss": 0.3331,
"step": 970
},
{
"epoch": 0.6919801277501775,
"grad_norm": 0.10114885482982008,
"learning_rate": 8.267915673631981e-06,
"loss": 0.32,
"step": 975
},
{
"epoch": 0.6955287437899219,
"grad_norm": 0.0962760318922265,
"learning_rate": 8.244413375919269e-06,
"loss": 0.3047,
"step": 980
},
{
"epoch": 0.6990773598296665,
"grad_norm": 0.11966643086285601,
"learning_rate": 8.220786604950473e-06,
"loss": 0.3396,
"step": 985
},
{
"epoch": 0.7026259758694109,
"grad_norm": 0.11712505156372695,
"learning_rate": 8.197036267176395e-06,
"loss": 0.3104,
"step": 990
},
{
"epoch": 0.7061745919091554,
"grad_norm": 0.10334481701710858,
"learning_rate": 8.173163273788533e-06,
"loss": 0.3067,
"step": 995
},
{
"epoch": 0.7097232079489,
"grad_norm": 0.1154229573550237,
"learning_rate": 8.149168540684114e-06,
"loss": 0.3668,
"step": 1000
},
{
"epoch": 0.7132718239886444,
"grad_norm": 0.10712457825642942,
"learning_rate": 8.12505298843096e-06,
"loss": 0.3092,
"step": 1005
},
{
"epoch": 0.716820440028389,
"grad_norm": 0.11197762765282619,
"learning_rate": 8.100817542232175e-06,
"loss": 0.2859,
"step": 1010
},
{
"epoch": 0.7203690560681334,
"grad_norm": 0.2460991704891257,
"learning_rate": 8.076463131890635e-06,
"loss": 0.3232,
"step": 1015
},
{
"epoch": 0.7239176721078779,
"grad_norm": 0.1283744065173745,
"learning_rate": 8.051990691773325e-06,
"loss": 0.3252,
"step": 1020
},
{
"epoch": 0.7274662881476224,
"grad_norm": 0.12259460816217238,
"learning_rate": 8.027401160775505e-06,
"loss": 0.3126,
"step": 1025
},
{
"epoch": 0.7310149041873669,
"grad_norm": 0.09656645726784309,
"learning_rate": 8.002695482284655e-06,
"loss": 0.288,
"step": 1030
},
{
"epoch": 0.7345635202271115,
"grad_norm": 0.11992383938615657,
"learning_rate": 7.977874604144314e-06,
"loss": 0.3215,
"step": 1035
},
{
"epoch": 0.7381121362668559,
"grad_norm": 0.10129933470220466,
"learning_rate": 7.952939478617698e-06,
"loss": 0.318,
"step": 1040
},
{
"epoch": 0.7416607523066004,
"grad_norm": 0.10080587163474378,
"learning_rate": 7.927891062351176e-06,
"loss": 0.2869,
"step": 1045
},
{
"epoch": 0.7452093683463449,
"grad_norm": 0.09788743558770831,
"learning_rate": 7.902730316337556e-06,
"loss": 0.3058,
"step": 1050
},
{
"epoch": 0.7487579843860894,
"grad_norm": 0.10032978871750338,
"learning_rate": 7.87745820587923e-06,
"loss": 0.3137,
"step": 1055
},
{
"epoch": 0.752306600425834,
"grad_norm": 0.10706257074063476,
"learning_rate": 7.852075700551129e-06,
"loss": 0.2996,
"step": 1060
},
{
"epoch": 0.7558552164655784,
"grad_norm": 0.09724476100094016,
"learning_rate": 7.826583774163527e-06,
"loss": 0.312,
"step": 1065
},
{
"epoch": 0.759403832505323,
"grad_norm": 0.0907409726975251,
"learning_rate": 7.800983404724687e-06,
"loss": 0.2906,
"step": 1070
},
{
"epoch": 0.7629524485450674,
"grad_norm": 0.10307321939032585,
"learning_rate": 7.77527557440333e-06,
"loss": 0.3194,
"step": 1075
},
{
"epoch": 0.7665010645848119,
"grad_norm": 0.1247032874494029,
"learning_rate": 7.74946126949096e-06,
"loss": 0.2855,
"step": 1080
},
{
"epoch": 0.7700496806245565,
"grad_norm": 0.11216912545924056,
"learning_rate": 7.723541480364021e-06,
"loss": 0.311,
"step": 1085
},
{
"epoch": 0.7735982966643009,
"grad_norm": 0.12612240338808522,
"learning_rate": 7.697517201445906e-06,
"loss": 0.309,
"step": 1090
},
{
"epoch": 0.7771469127040455,
"grad_norm": 0.1396854298326299,
"learning_rate": 7.671389431168799e-06,
"loss": 0.3136,
"step": 1095
},
{
"epoch": 0.7806955287437899,
"grad_norm": 0.1146849110049059,
"learning_rate": 7.64515917193537e-06,
"loss": 0.2864,
"step": 1100
},
{
"epoch": 0.7842441447835344,
"grad_norm": 0.1251118885044735,
"learning_rate": 7.618827430080326e-06,
"loss": 0.2965,
"step": 1105
},
{
"epoch": 0.7877927608232789,
"grad_norm": 0.12161720020363276,
"learning_rate": 7.592395215831793e-06,
"loss": 0.2897,
"step": 1110
},
{
"epoch": 0.7913413768630234,
"grad_norm": 0.1035220486176832,
"learning_rate": 7.565863543272563e-06,
"loss": 0.2934,
"step": 1115
},
{
"epoch": 0.794889992902768,
"grad_norm": 0.15506806874441564,
"learning_rate": 7.539233430301186e-06,
"loss": 0.2915,
"step": 1120
},
{
"epoch": 0.7984386089425124,
"grad_norm": 0.10316583850969425,
"learning_rate": 7.51250589859292e-06,
"loss": 0.2823,
"step": 1125
},
{
"epoch": 0.8019872249822569,
"grad_norm": 0.1172206546936995,
"learning_rate": 7.485681973560532e-06,
"loss": 0.2833,
"step": 1130
},
{
"epoch": 0.8055358410220014,
"grad_norm": 0.10897623530520172,
"learning_rate": 7.458762684314959e-06,
"loss": 0.307,
"step": 1135
},
{
"epoch": 0.8090844570617459,
"grad_norm": 0.11743567575075758,
"learning_rate": 7.431749063625827e-06,
"loss": 0.3106,
"step": 1140
},
{
"epoch": 0.8126330731014905,
"grad_norm": 0.11398426901184441,
"learning_rate": 7.404642147881824e-06,
"loss": 0.305,
"step": 1145
},
{
"epoch": 0.8161816891412349,
"grad_norm": 0.14382819872280167,
"learning_rate": 7.377442977050942e-06,
"loss": 0.3275,
"step": 1150
},
{
"epoch": 0.8197303051809794,
"grad_norm": 0.12031276138448121,
"learning_rate": 7.350152594640577e-06,
"loss": 0.2836,
"step": 1155
},
{
"epoch": 0.8232789212207239,
"grad_norm": 0.11646232833990451,
"learning_rate": 7.322772047657498e-06,
"loss": 0.2964,
"step": 1160
},
{
"epoch": 0.8268275372604684,
"grad_norm": 0.1447674271815869,
"learning_rate": 7.2953023865676716e-06,
"loss": 0.2987,
"step": 1165
},
{
"epoch": 0.830376153300213,
"grad_norm": 0.10627909955783361,
"learning_rate": 7.267744665255966e-06,
"loss": 0.2988,
"step": 1170
},
{
"epoch": 0.8339247693399574,
"grad_norm": 0.1296134679162657,
"learning_rate": 7.240099940985712e-06,
"loss": 0.3121,
"step": 1175
},
{
"epoch": 0.837473385379702,
"grad_norm": 0.11305715942336415,
"learning_rate": 7.212369274358151e-06,
"loss": 0.3065,
"step": 1180
},
{
"epoch": 0.8410220014194464,
"grad_norm": 0.13019600153474778,
"learning_rate": 7.184553729271732e-06,
"loss": 0.3017,
"step": 1185
},
{
"epoch": 0.8445706174591909,
"grad_norm": 0.11791715639518001,
"learning_rate": 7.156654372881308e-06,
"loss": 0.2913,
"step": 1190
},
{
"epoch": 0.8481192334989354,
"grad_norm": 0.11997685850730358,
"learning_rate": 7.1286722755571795e-06,
"loss": 0.3039,
"step": 1195
},
{
"epoch": 0.8516678495386799,
"grad_norm": 0.1484639272635457,
"learning_rate": 7.100608510844041e-06,
"loss": 0.3207,
"step": 1200
},
{
"epoch": 0.8552164655784245,
"grad_norm": 0.11663065340767796,
"learning_rate": 7.072464155419794e-06,
"loss": 0.295,
"step": 1205
},
{
"epoch": 0.8587650816181689,
"grad_norm": 0.12649076850391006,
"learning_rate": 7.044240289054227e-06,
"loss": 0.3019,
"step": 1210
},
{
"epoch": 0.8623136976579134,
"grad_norm": 0.12371486860650412,
"learning_rate": 7.015937994567607e-06,
"loss": 0.287,
"step": 1215
},
{
"epoch": 0.8658623136976579,
"grad_norm": 0.11836636428084986,
"learning_rate": 6.987558357789122e-06,
"loss": 0.3073,
"step": 1220
},
{
"epoch": 0.8694109297374024,
"grad_norm": 0.10623220843129759,
"learning_rate": 6.959102467515232e-06,
"loss": 0.2748,
"step": 1225
},
{
"epoch": 0.872959545777147,
"grad_norm": 0.1255560594379443,
"learning_rate": 6.930571415467893e-06,
"loss": 0.2792,
"step": 1230
},
{
"epoch": 0.8765081618168914,
"grad_norm": 0.10744914430702476,
"learning_rate": 6.901966296252673e-06,
"loss": 0.2641,
"step": 1235
},
{
"epoch": 0.8800567778566359,
"grad_norm": 0.1331015713855634,
"learning_rate": 6.873288207316761e-06,
"loss": 0.3025,
"step": 1240
},
{
"epoch": 0.8836053938963804,
"grad_norm": 0.13689393825686838,
"learning_rate": 6.844538248906851e-06,
"loss": 0.2664,
"step": 1245
},
{
"epoch": 0.8871540099361249,
"grad_norm": 0.11787704953996495,
"learning_rate": 6.8157175240269495e-06,
"loss": 0.2887,
"step": 1250
},
{
"epoch": 0.8907026259758695,
"grad_norm": 0.10637450717254661,
"learning_rate": 6.78682713839604e-06,
"loss": 0.2877,
"step": 1255
},
{
"epoch": 0.8942512420156139,
"grad_norm": 0.12360008103572975,
"learning_rate": 6.757868200405673e-06,
"loss": 0.2837,
"step": 1260
},
{
"epoch": 0.8977998580553584,
"grad_norm": 0.13686203601114932,
"learning_rate": 6.728841821077436e-06,
"loss": 0.2873,
"step": 1265
},
{
"epoch": 0.9013484740951029,
"grad_norm": 0.12339315922661516,
"learning_rate": 6.699749114020332e-06,
"loss": 0.2741,
"step": 1270
},
{
"epoch": 0.9048970901348474,
"grad_norm": 0.12157386757849972,
"learning_rate": 6.6705911953880585e-06,
"loss": 0.2836,
"step": 1275
},
{
"epoch": 0.9084457061745919,
"grad_norm": 0.1176810835621486,
"learning_rate": 6.641369183836178e-06,
"loss": 0.2843,
"step": 1280
},
{
"epoch": 0.9119943222143364,
"grad_norm": 0.13333835009147263,
"learning_rate": 6.6120842004792055e-06,
"loss": 0.295,
"step": 1285
},
{
"epoch": 0.915542938254081,
"grad_norm": 0.1472022748970272,
"learning_rate": 6.5827373688475925e-06,
"loss": 0.2954,
"step": 1290
},
{
"epoch": 0.9190915542938254,
"grad_norm": 0.14206193918153712,
"learning_rate": 6.553329814844629e-06,
"loss": 0.3085,
"step": 1295
},
{
"epoch": 0.9226401703335699,
"grad_norm": 0.14396025800454598,
"learning_rate": 6.5238626667032425e-06,
"loss": 0.2697,
"step": 1300
},
{
"epoch": 0.9261887863733144,
"grad_norm": 0.16109422401483736,
"learning_rate": 6.494337054942714e-06,
"loss": 0.2798,
"step": 1305
},
{
"epoch": 0.9297374024130589,
"grad_norm": 0.1350268037714893,
"learning_rate": 6.464754112325305e-06,
"loss": 0.2907,
"step": 1310
},
{
"epoch": 0.9332860184528035,
"grad_norm": 0.11664234411687084,
"learning_rate": 6.435114973812797e-06,
"loss": 0.2987,
"step": 1315
},
{
"epoch": 0.9368346344925479,
"grad_norm": 0.14333497651497332,
"learning_rate": 6.4054207765229544e-06,
"loss": 0.2848,
"step": 1320
},
{
"epoch": 0.9403832505322924,
"grad_norm": 0.14082980667224726,
"learning_rate": 6.375672659685894e-06,
"loss": 0.3095,
"step": 1325
},
{
"epoch": 0.9439318665720369,
"grad_norm": 0.14994727943151237,
"learning_rate": 6.3458717646003746e-06,
"loss": 0.2722,
"step": 1330
},
{
"epoch": 0.9474804826117814,
"grad_norm": 0.10375072765429182,
"learning_rate": 6.3160192345900155e-06,
"loss": 0.2551,
"step": 1335
},
{
"epoch": 0.9510290986515259,
"grad_norm": 0.14942446492004308,
"learning_rate": 6.286116214959432e-06,
"loss": 0.2776,
"step": 1340
},
{
"epoch": 0.9545777146912704,
"grad_norm": 0.17229906045875112,
"learning_rate": 6.256163852950296e-06,
"loss": 0.2762,
"step": 1345
},
{
"epoch": 0.9581263307310149,
"grad_norm": 0.1386612762842367,
"learning_rate": 6.2261632976973164e-06,
"loss": 0.2888,
"step": 1350
},
{
"epoch": 0.9616749467707594,
"grad_norm": 0.17808691974793162,
"learning_rate": 6.196115700184159e-06,
"loss": 0.2896,
"step": 1355
},
{
"epoch": 0.9652235628105039,
"grad_norm": 0.14015036438418899,
"learning_rate": 6.166022213199282e-06,
"loss": 0.299,
"step": 1360
},
{
"epoch": 0.9687721788502484,
"grad_norm": 0.14149091233095099,
"learning_rate": 6.1358839912917165e-06,
"loss": 0.2578,
"step": 1365
},
{
"epoch": 0.9723207948899929,
"grad_norm": 0.1372325532437649,
"learning_rate": 6.105702190726765e-06,
"loss": 0.3001,
"step": 1370
},
{
"epoch": 0.9758694109297374,
"grad_norm": 0.1415174100824182,
"learning_rate": 6.075477969441642e-06,
"loss": 0.2491,
"step": 1375
},
{
"epoch": 0.9794180269694819,
"grad_norm": 0.2073339797913589,
"learning_rate": 6.045212487001052e-06,
"loss": 0.31,
"step": 1380
},
{
"epoch": 0.9829666430092264,
"grad_norm": 0.1339294032059611,
"learning_rate": 6.014906904552699e-06,
"loss": 0.2628,
"step": 1385
},
{
"epoch": 0.9865152590489709,
"grad_norm": 0.17625465432960563,
"learning_rate": 5.9845623847827425e-06,
"loss": 0.2711,
"step": 1390
},
{
"epoch": 0.9900638750887154,
"grad_norm": 0.1353573542231085,
"learning_rate": 5.954180091871188e-06,
"loss": 0.2712,
"step": 1395
},
{
"epoch": 0.99361249112846,
"grad_norm": 0.20855231236963354,
"learning_rate": 5.923761191447223e-06,
"loss": 0.3101,
"step": 1400
},
{
"epoch": 0.9971611071682044,
"grad_norm": 0.13398046910239875,
"learning_rate": 5.893306850544495e-06,
"loss": 0.2758,
"step": 1405
},
{
"epoch": 1.0,
"eval_loss": 0.29164838790893555,
"eval_runtime": 35.366,
"eval_samples_per_second": 19.341,
"eval_steps_per_second": 4.835,
"step": 1409
},
{
"epoch": 1.000709723207949,
"grad_norm": 0.1499506546951247,
"learning_rate": 5.862818237556344e-06,
"loss": 0.2967,
"step": 1410
},
{
"epoch": 1.0042583392476934,
"grad_norm": 0.18756092379135153,
"learning_rate": 5.832296522190969e-06,
"loss": 0.2652,
"step": 1415
},
{
"epoch": 1.0078069552874378,
"grad_norm": 0.15776724183531865,
"learning_rate": 5.801742875426558e-06,
"loss": 0.2609,
"step": 1420
},
{
"epoch": 1.0113555713271825,
"grad_norm": 0.17053262827345367,
"learning_rate": 5.771158469466359e-06,
"loss": 0.2307,
"step": 1425
},
{
"epoch": 1.014904187366927,
"grad_norm": 0.1755338253006045,
"learning_rate": 5.740544477693709e-06,
"loss": 0.2442,
"step": 1430
},
{
"epoch": 1.0184528034066713,
"grad_norm": 0.1948154760618653,
"learning_rate": 5.7099020746270185e-06,
"loss": 0.2583,
"step": 1435
},
{
"epoch": 1.022001419446416,
"grad_norm": 0.2084861499491775,
"learning_rate": 5.679232435874708e-06,
"loss": 0.2599,
"step": 1440
},
{
"epoch": 1.0255500354861604,
"grad_norm": 0.17091074245999738,
"learning_rate": 5.648536738090103e-06,
"loss": 0.2638,
"step": 1445
},
{
"epoch": 1.0290986515259049,
"grad_norm": 0.1768275466377568,
"learning_rate": 5.617816158926303e-06,
"loss": 0.239,
"step": 1450
},
{
"epoch": 1.0326472675656495,
"grad_norm": 0.23283282968815805,
"learning_rate": 5.587071876990982e-06,
"loss": 0.2576,
"step": 1455
},
{
"epoch": 1.036195883605394,
"grad_norm": 0.1662603889344521,
"learning_rate": 5.556305071801189e-06,
"loss": 0.2487,
"step": 1460
},
{
"epoch": 1.0397444996451384,
"grad_norm": 0.1744733235172125,
"learning_rate": 5.525516923738079e-06,
"loss": 0.2299,
"step": 1465
},
{
"epoch": 1.0432931156848828,
"grad_norm": 0.19177175405238575,
"learning_rate": 5.494708614001643e-06,
"loss": 0.2524,
"step": 1470
},
{
"epoch": 1.0468417317246275,
"grad_norm": 0.17211356983404122,
"learning_rate": 5.463881324565376e-06,
"loss": 0.243,
"step": 1475
},
{
"epoch": 1.050390347764372,
"grad_norm": 0.17813842895621507,
"learning_rate": 5.433036238130941e-06,
"loss": 0.243,
"step": 1480
},
{
"epoch": 1.0539389638041163,
"grad_norm": 0.1596674898349891,
"learning_rate": 5.402174538082792e-06,
"loss": 0.2612,
"step": 1485
},
{
"epoch": 1.057487579843861,
"grad_norm": 0.1749700091068219,
"learning_rate": 5.371297408442765e-06,
"loss": 0.2533,
"step": 1490
},
{
"epoch": 1.0610361958836054,
"grad_norm": 0.17571107412705392,
"learning_rate": 5.3404060338246636e-06,
"loss": 0.258,
"step": 1495
},
{
"epoch": 1.0645848119233499,
"grad_norm": 0.21965889477210226,
"learning_rate": 5.309501599388804e-06,
"loss": 0.2746,
"step": 1500
},
{
"epoch": 1.0681334279630943,
"grad_norm": 0.19562018899583306,
"learning_rate": 5.278585290796549e-06,
"loss": 0.2347,
"step": 1505
},
{
"epoch": 1.071682044002839,
"grad_norm": 0.19383380723342078,
"learning_rate": 5.247658294164817e-06,
"loss": 0.2243,
"step": 1510
},
{
"epoch": 1.0752306600425834,
"grad_norm": 0.19858266410790124,
"learning_rate": 5.216721796020576e-06,
"loss": 0.2418,
"step": 1515
},
{
"epoch": 1.0787792760823278,
"grad_norm": 0.1802758926817163,
"learning_rate": 5.1857769832553275e-06,
"loss": 0.2358,
"step": 1520
},
{
"epoch": 1.0823278921220725,
"grad_norm": 0.1919887272398654,
"learning_rate": 5.154825043079563e-06,
"loss": 0.24,
"step": 1525
},
{
"epoch": 1.085876508161817,
"grad_norm": 0.18341392588170258,
"learning_rate": 5.123867162977224e-06,
"loss": 0.2174,
"step": 1530
},
{
"epoch": 1.0894251242015613,
"grad_norm": 0.24023347300153372,
"learning_rate": 5.092904530660135e-06,
"loss": 0.2583,
"step": 1535
},
{
"epoch": 1.0929737402413058,
"grad_norm": 0.18142776996257998,
"learning_rate": 5.061938334022444e-06,
"loss": 0.1988,
"step": 1540
},
{
"epoch": 1.0965223562810504,
"grad_norm": 0.20278233658851152,
"learning_rate": 5.030969761095044e-06,
"loss": 0.2305,
"step": 1545
},
{
"epoch": 1.1000709723207949,
"grad_norm": 0.16422096910156064,
"learning_rate": 5e-06,
"loss": 0.2428,
"step": 1550
},
{
"epoch": 1.1036195883605393,
"grad_norm": 0.21414754101292438,
"learning_rate": 4.9690302389049564e-06,
"loss": 0.2518,
"step": 1555
},
{
"epoch": 1.107168204400284,
"grad_norm": 0.2021325216975729,
"learning_rate": 4.938061665977558e-06,
"loss": 0.2288,
"step": 1560
},
{
"epoch": 1.1107168204400284,
"grad_norm": 0.2242028785347262,
"learning_rate": 4.907095469339867e-06,
"loss": 0.2218,
"step": 1565
},
{
"epoch": 1.1142654364797728,
"grad_norm": 0.2439659859739963,
"learning_rate": 4.876132837022778e-06,
"loss": 0.2222,
"step": 1570
},
{
"epoch": 1.1178140525195175,
"grad_norm": 0.22109636064621938,
"learning_rate": 4.845174956920437e-06,
"loss": 0.2179,
"step": 1575
},
{
"epoch": 1.121362668559262,
"grad_norm": 0.2020515616574426,
"learning_rate": 4.814223016744673e-06,
"loss": 0.2483,
"step": 1580
},
{
"epoch": 1.1249112845990064,
"grad_norm": 0.1929602920987722,
"learning_rate": 4.7832782039794244e-06,
"loss": 0.2094,
"step": 1585
},
{
"epoch": 1.1284599006387508,
"grad_norm": 0.22946871127048335,
"learning_rate": 4.752341705835185e-06,
"loss": 0.2471,
"step": 1590
},
{
"epoch": 1.1320085166784954,
"grad_norm": 0.2353650018312691,
"learning_rate": 4.7214147092034515e-06,
"loss": 0.2173,
"step": 1595
},
{
"epoch": 1.1355571327182399,
"grad_norm": 0.22231376916574838,
"learning_rate": 4.690498400611197e-06,
"loss": 0.254,
"step": 1600
},
{
"epoch": 1.1391057487579843,
"grad_norm": 0.2708368587410945,
"learning_rate": 4.659593966175337e-06,
"loss": 0.1993,
"step": 1605
},
{
"epoch": 1.142654364797729,
"grad_norm": 0.19302945406029343,
"learning_rate": 4.628702591557237e-06,
"loss": 0.2291,
"step": 1610
},
{
"epoch": 1.1462029808374734,
"grad_norm": 0.24384185885067486,
"learning_rate": 4.597825461917211e-06,
"loss": 0.2014,
"step": 1615
},
{
"epoch": 1.1497515968772178,
"grad_norm": 0.19288576362791393,
"learning_rate": 4.566963761869059e-06,
"loss": 0.2247,
"step": 1620
},
{
"epoch": 1.1533002129169625,
"grad_norm": 0.22844985585862837,
"learning_rate": 4.536118675434625e-06,
"loss": 0.2427,
"step": 1625
},
{
"epoch": 1.156848828956707,
"grad_norm": 0.2110712890072982,
"learning_rate": 4.505291385998359e-06,
"loss": 0.2327,
"step": 1630
},
{
"epoch": 1.1603974449964514,
"grad_norm": 0.21359454140314296,
"learning_rate": 4.474483076261922e-06,
"loss": 0.2157,
"step": 1635
},
{
"epoch": 1.1639460610361958,
"grad_norm": 0.18349595492014717,
"learning_rate": 4.443694928198813e-06,
"loss": 0.2207,
"step": 1640
},
{
"epoch": 1.1674946770759405,
"grad_norm": 0.21517822823819194,
"learning_rate": 4.4129281230090185e-06,
"loss": 0.2109,
"step": 1645
},
{
"epoch": 1.171043293115685,
"grad_norm": 0.20915961721436363,
"learning_rate": 4.382183841073698e-06,
"loss": 0.2233,
"step": 1650
},
{
"epoch": 1.1745919091554293,
"grad_norm": 0.1899258672522937,
"learning_rate": 4.351463261909898e-06,
"loss": 0.2059,
"step": 1655
},
{
"epoch": 1.178140525195174,
"grad_norm": 0.3176618175633807,
"learning_rate": 4.3207675641252955e-06,
"loss": 0.2098,
"step": 1660
},
{
"epoch": 1.1816891412349184,
"grad_norm": 0.24686245724887768,
"learning_rate": 4.290097925372982e-06,
"loss": 0.226,
"step": 1665
},
{
"epoch": 1.1852377572746629,
"grad_norm": 0.18589537427322667,
"learning_rate": 4.259455522306292e-06,
"loss": 0.2093,
"step": 1670
},
{
"epoch": 1.1887863733144073,
"grad_norm": 0.23952935889638277,
"learning_rate": 4.228841530533642e-06,
"loss": 0.2083,
"step": 1675
},
{
"epoch": 1.192334989354152,
"grad_norm": 0.19816591776664266,
"learning_rate": 4.198257124573443e-06,
"loss": 0.2153,
"step": 1680
},
{
"epoch": 1.1958836053938964,
"grad_norm": 0.2325874189419696,
"learning_rate": 4.167703477809032e-06,
"loss": 0.2256,
"step": 1685
},
{
"epoch": 1.1994322214336408,
"grad_norm": 0.27655356789940516,
"learning_rate": 4.137181762443658e-06,
"loss": 0.2014,
"step": 1690
},
{
"epoch": 1.2029808374733855,
"grad_norm": 0.20530958975208544,
"learning_rate": 4.106693149455508e-06,
"loss": 0.2307,
"step": 1695
},
{
"epoch": 1.20652945351313,
"grad_norm": 0.27989364785563464,
"learning_rate": 4.07623880855278e-06,
"loss": 0.1987,
"step": 1700
},
{
"epoch": 1.2100780695528743,
"grad_norm": 0.24191371122093697,
"learning_rate": 4.045819908128814e-06,
"loss": 0.2062,
"step": 1705
},
{
"epoch": 1.2136266855926188,
"grad_norm": 0.22977069417153437,
"learning_rate": 4.015437615217258e-06,
"loss": 0.2085,
"step": 1710
},
{
"epoch": 1.2171753016323634,
"grad_norm": 0.22788529768519772,
"learning_rate": 3.985093095447302e-06,
"loss": 0.2102,
"step": 1715
},
{
"epoch": 1.2207239176721079,
"grad_norm": 0.28946012716172914,
"learning_rate": 3.954787512998949e-06,
"loss": 0.1949,
"step": 1720
},
{
"epoch": 1.2242725337118523,
"grad_norm": 0.2605804901457581,
"learning_rate": 3.924522030558359e-06,
"loss": 0.1855,
"step": 1725
},
{
"epoch": 1.227821149751597,
"grad_norm": 0.21575771982283123,
"learning_rate": 3.894297809273237e-06,
"loss": 0.1856,
"step": 1730
},
{
"epoch": 1.2313697657913414,
"grad_norm": 0.2403596875003987,
"learning_rate": 3.864116008708285e-06,
"loss": 0.2019,
"step": 1735
},
{
"epoch": 1.2349183818310858,
"grad_norm": 0.24729712341655072,
"learning_rate": 3.83397778680072e-06,
"loss": 0.1893,
"step": 1740
},
{
"epoch": 1.2384669978708303,
"grad_norm": 0.22692358206492375,
"learning_rate": 3.8038842998158444e-06,
"loss": 0.1795,
"step": 1745
},
{
"epoch": 1.242015613910575,
"grad_norm": 0.22396407149449804,
"learning_rate": 3.773836702302686e-06,
"loss": 0.189,
"step": 1750
},
{
"epoch": 1.2455642299503193,
"grad_norm": 0.1904280560297174,
"learning_rate": 3.7438361470497047e-06,
"loss": 0.1764,
"step": 1755
},
{
"epoch": 1.2491128459900638,
"grad_norm": 0.29211597322985733,
"learning_rate": 3.7138837850405683e-06,
"loss": 0.1849,
"step": 1760
},
{
"epoch": 1.2526614620298084,
"grad_norm": 0.1955284532552154,
"learning_rate": 3.683980765409986e-06,
"loss": 0.2173,
"step": 1765
},
{
"epoch": 1.2562100780695529,
"grad_norm": 0.23466345155686616,
"learning_rate": 3.6541282353996275e-06,
"loss": 0.2064,
"step": 1770
},
{
"epoch": 1.2597586941092973,
"grad_norm": 0.2327320959889328,
"learning_rate": 3.6243273403141076e-06,
"loss": 0.1987,
"step": 1775
},
{
"epoch": 1.2633073101490417,
"grad_norm": 0.2524353456777873,
"learning_rate": 3.594579223477046e-06,
"loss": 0.1811,
"step": 1780
},
{
"epoch": 1.2668559261887864,
"grad_norm": 0.23557064123762672,
"learning_rate": 3.564885026187205e-06,
"loss": 0.2076,
"step": 1785
},
{
"epoch": 1.2704045422285308,
"grad_norm": 0.2816214716887357,
"learning_rate": 3.535245887674698e-06,
"loss": 0.1682,
"step": 1790
},
{
"epoch": 1.2739531582682755,
"grad_norm": 0.20319000372483886,
"learning_rate": 3.505662945057289e-06,
"loss": 0.1749,
"step": 1795
},
{
"epoch": 1.27750177430802,
"grad_norm": 0.27886243079793344,
"learning_rate": 3.4761373332967587e-06,
"loss": 0.1717,
"step": 1800
},
{
"epoch": 1.2810503903477644,
"grad_norm": 0.2131502794484524,
"learning_rate": 3.446670185155372e-06,
"loss": 0.1845,
"step": 1805
},
{
"epoch": 1.2845990063875088,
"grad_norm": 0.23513886183273328,
"learning_rate": 3.417262631152409e-06,
"loss": 0.1659,
"step": 1810
},
{
"epoch": 1.2881476224272534,
"grad_norm": 0.20993042071904863,
"learning_rate": 3.3879157995207965e-06,
"loss": 0.1797,
"step": 1815
},
{
"epoch": 1.2916962384669979,
"grad_norm": 0.2569811057408275,
"learning_rate": 3.3586308161638224e-06,
"loss": 0.1909,
"step": 1820
},
{
"epoch": 1.2952448545067423,
"grad_norm": 0.2422324199998196,
"learning_rate": 3.3294088046119423e-06,
"loss": 0.174,
"step": 1825
},
{
"epoch": 1.298793470546487,
"grad_norm": 0.22486064755230015,
"learning_rate": 3.300250885979669e-06,
"loss": 0.1569,
"step": 1830
},
{
"epoch": 1.3023420865862314,
"grad_norm": 0.24306600674795684,
"learning_rate": 3.2711581789225665e-06,
"loss": 0.172,
"step": 1835
},
{
"epoch": 1.3058907026259758,
"grad_norm": 0.20560747774916166,
"learning_rate": 3.24213179959433e-06,
"loss": 0.1816,
"step": 1840
},
{
"epoch": 1.3094393186657203,
"grad_norm": 0.29358167110568545,
"learning_rate": 3.2131728616039613e-06,
"loss": 0.1791,
"step": 1845
},
{
"epoch": 1.312987934705465,
"grad_norm": 0.22625877927643648,
"learning_rate": 3.1842824759730518e-06,
"loss": 0.178,
"step": 1850
},
{
"epoch": 1.3165365507452094,
"grad_norm": 0.24275796444257852,
"learning_rate": 3.1554617510931494e-06,
"loss": 0.189,
"step": 1855
},
{
"epoch": 1.3200851667849538,
"grad_norm": 0.2249237016657022,
"learning_rate": 3.1267117926832406e-06,
"loss": 0.153,
"step": 1860
},
{
"epoch": 1.3236337828246985,
"grad_norm": 0.24223527027968392,
"learning_rate": 3.098033703747327e-06,
"loss": 0.1745,
"step": 1865
},
{
"epoch": 1.327182398864443,
"grad_norm": 0.2589988574085145,
"learning_rate": 3.069428584532108e-06,
"loss": 0.1526,
"step": 1870
},
{
"epoch": 1.3307310149041873,
"grad_norm": 0.2548569392318924,
"learning_rate": 3.04089753248477e-06,
"loss": 0.1526,
"step": 1875
},
{
"epoch": 1.3342796309439318,
"grad_norm": 0.2910264508784293,
"learning_rate": 3.0124416422108797e-06,
"loss": 0.1907,
"step": 1880
},
{
"epoch": 1.3378282469836764,
"grad_norm": 0.2160613132957401,
"learning_rate": 2.9840620054323947e-06,
"loss": 0.1562,
"step": 1885
},
{
"epoch": 1.3413768630234209,
"grad_norm": 0.23671464694515207,
"learning_rate": 2.955759710945773e-06,
"loss": 0.1729,
"step": 1890
},
{
"epoch": 1.3449254790631653,
"grad_norm": 0.19218155125021036,
"learning_rate": 2.9275358445802073e-06,
"loss": 0.1525,
"step": 1895
},
{
"epoch": 1.34847409510291,
"grad_norm": 0.2619376354184329,
"learning_rate": 2.8993914891559583e-06,
"loss": 0.1697,
"step": 1900
},
{
"epoch": 1.3520227111426544,
"grad_norm": 0.2806820968632524,
"learning_rate": 2.8713277244428235e-06,
"loss": 0.1525,
"step": 1905
},
{
"epoch": 1.3555713271823988,
"grad_norm": 0.25736297201303776,
"learning_rate": 2.8433456271186955e-06,
"loss": 0.1505,
"step": 1910
},
{
"epoch": 1.3591199432221432,
"grad_norm": 0.2337092369279189,
"learning_rate": 2.8154462707282697e-06,
"loss": 0.1689,
"step": 1915
},
{
"epoch": 1.362668559261888,
"grad_norm": 0.3001037057919328,
"learning_rate": 2.7876307256418517e-06,
"loss": 0.1431,
"step": 1920
},
{
"epoch": 1.3662171753016323,
"grad_norm": 0.2839033066706657,
"learning_rate": 2.75990005901429e-06,
"loss": 0.1412,
"step": 1925
},
{
"epoch": 1.369765791341377,
"grad_norm": 0.2622064687845773,
"learning_rate": 2.7322553347440368e-06,
"loss": 0.1623,
"step": 1930
},
{
"epoch": 1.3733144073811214,
"grad_norm": 0.2507224631344576,
"learning_rate": 2.7046976134323284e-06,
"loss": 0.1434,
"step": 1935
},
{
"epoch": 1.3768630234208659,
"grad_norm": 0.2090303255365663,
"learning_rate": 2.677227952342502e-06,
"loss": 0.1389,
"step": 1940
},
{
"epoch": 1.3804116394606103,
"grad_norm": 0.265845863480322,
"learning_rate": 2.649847405359423e-06,
"loss": 0.1512,
"step": 1945
},
{
"epoch": 1.3839602555003547,
"grad_norm": 0.28374755138809177,
"learning_rate": 2.622557022949059e-06,
"loss": 0.1448,
"step": 1950
},
{
"epoch": 1.3875088715400994,
"grad_norm": 0.27168379934408826,
"learning_rate": 2.5953578521181778e-06,
"loss": 0.1601,
"step": 1955
},
{
"epoch": 1.3910574875798438,
"grad_norm": 0.2340237687304427,
"learning_rate": 2.5682509363741738e-06,
"loss": 0.1325,
"step": 1960
},
{
"epoch": 1.3946061036195885,
"grad_norm": 0.2303866731358228,
"learning_rate": 2.541237315685041e-06,
"loss": 0.1378,
"step": 1965
},
{
"epoch": 1.398154719659333,
"grad_norm": 0.2502513251307768,
"learning_rate": 2.514318026439469e-06,
"loss": 0.1465,
"step": 1970
},
{
"epoch": 1.4017033356990773,
"grad_norm": 0.20376176504359841,
"learning_rate": 2.4874941014070815e-06,
"loss": 0.1152,
"step": 1975
},
{
"epoch": 1.4052519517388218,
"grad_norm": 0.21092138836167548,
"learning_rate": 2.4607665696988153e-06,
"loss": 0.1557,
"step": 1980
},
{
"epoch": 1.4088005677785664,
"grad_norm": 0.21875898364131807,
"learning_rate": 2.4341364567274385e-06,
"loss": 0.1214,
"step": 1985
},
{
"epoch": 1.4123491838183109,
"grad_norm": 0.2425908539113475,
"learning_rate": 2.407604784168208e-06,
"loss": 0.1449,
"step": 1990
},
{
"epoch": 1.4158977998580553,
"grad_norm": 0.22643993500762022,
"learning_rate": 2.381172569919676e-06,
"loss": 0.1333,
"step": 1995
},
{
"epoch": 1.4194464158978,
"grad_norm": 0.2695392174717145,
"learning_rate": 2.354840828064632e-06,
"loss": 0.1361,
"step": 2000
},
{
"epoch": 1.4229950319375444,
"grad_norm": 0.19332277870019038,
"learning_rate": 2.3286105688312043e-06,
"loss": 0.1157,
"step": 2005
},
{
"epoch": 1.4265436479772888,
"grad_norm": 0.21279219762574736,
"learning_rate": 2.302482798554096e-06,
"loss": 0.1389,
"step": 2010
},
{
"epoch": 1.4300922640170333,
"grad_norm": 0.25582716492751517,
"learning_rate": 2.276458519635981e-06,
"loss": 0.1313,
"step": 2015
},
{
"epoch": 1.433640880056778,
"grad_norm": 0.19406601566905687,
"learning_rate": 2.2505387305090422e-06,
"loss": 0.1169,
"step": 2020
},
{
"epoch": 1.4371894960965224,
"grad_norm": 0.24103344206226857,
"learning_rate": 2.224724425596672e-06,
"loss": 0.1237,
"step": 2025
},
{
"epoch": 1.4407381121362668,
"grad_norm": 0.22132596734131824,
"learning_rate": 2.199016595275313e-06,
"loss": 0.1302,
"step": 2030
},
{
"epoch": 1.4442867281760114,
"grad_norm": 0.1941988247320679,
"learning_rate": 2.1734162258364723e-06,
"loss": 0.1101,
"step": 2035
},
{
"epoch": 1.4478353442157559,
"grad_norm": 0.2532240345198083,
"learning_rate": 2.1479242994488715e-06,
"loss": 0.1456,
"step": 2040
},
{
"epoch": 1.4513839602555003,
"grad_norm": 0.2192071100155077,
"learning_rate": 2.1225417941207693e-06,
"loss": 0.1347,
"step": 2045
},
{
"epoch": 1.4549325762952448,
"grad_norm": 0.23048316879795078,
"learning_rate": 2.097269683662444e-06,
"loss": 0.1206,
"step": 2050
},
{
"epoch": 1.4584811923349894,
"grad_norm": 0.22175626911805313,
"learning_rate": 2.0721089376488253e-06,
"loss": 0.1323,
"step": 2055
},
{
"epoch": 1.4620298083747338,
"grad_norm": 0.2502555002765058,
"learning_rate": 2.047060521382303e-06,
"loss": 0.1328,
"step": 2060
},
{
"epoch": 1.4655784244144783,
"grad_norm": 0.2398815628977275,
"learning_rate": 2.022125395855688e-06,
"loss": 0.1197,
"step": 2065
},
{
"epoch": 1.469127040454223,
"grad_norm": 0.1490857229171638,
"learning_rate": 1.9973045177153474e-06,
"loss": 0.1213,
"step": 2070
},
{
"epoch": 1.4726756564939674,
"grad_norm": 0.24352054577970925,
"learning_rate": 1.9725988392244973e-06,
"loss": 0.1293,
"step": 2075
},
{
"epoch": 1.4762242725337118,
"grad_norm": 0.2752743649146217,
"learning_rate": 1.948009308226674e-06,
"loss": 0.1257,
"step": 2080
},
{
"epoch": 1.4797728885734562,
"grad_norm": 0.23848651147423147,
"learning_rate": 1.923536868109368e-06,
"loss": 0.1133,
"step": 2085
},
{
"epoch": 1.483321504613201,
"grad_norm": 0.226375123844369,
"learning_rate": 1.8991824577678269e-06,
"loss": 0.1248,
"step": 2090
},
{
"epoch": 1.4868701206529453,
"grad_norm": 0.17831272705991047,
"learning_rate": 1.8749470115690405e-06,
"loss": 0.1191,
"step": 2095
},
{
"epoch": 1.49041873669269,
"grad_norm": 0.19989309118105109,
"learning_rate": 1.8508314593158876e-06,
"loss": 0.1089,
"step": 2100
},
{
"epoch": 1.4939673527324344,
"grad_norm": 0.25291957563867945,
"learning_rate": 1.8268367262114688e-06,
"loss": 0.1107,
"step": 2105
},
{
"epoch": 1.4975159687721789,
"grad_norm": 0.26111553623184586,
"learning_rate": 1.8029637328236066e-06,
"loss": 0.127,
"step": 2110
},
{
"epoch": 1.5010645848119233,
"grad_norm": 0.17734621444425636,
"learning_rate": 1.7792133950495294e-06,
"loss": 0.1221,
"step": 2115
},
{
"epoch": 1.5046132008516677,
"grad_norm": 0.21866895385006405,
"learning_rate": 1.7555866240807313e-06,
"loss": 0.1175,
"step": 2120
},
{
"epoch": 1.5081618168914124,
"grad_norm": 0.21206145415522118,
"learning_rate": 1.7320843263680197e-06,
"loss": 0.1112,
"step": 2125
},
{
"epoch": 1.5117104329311568,
"grad_norm": 0.23217405715138023,
"learning_rate": 1.7087074035867284e-06,
"loss": 0.1236,
"step": 2130
},
{
"epoch": 1.5152590489709015,
"grad_norm": 0.23613246716958128,
"learning_rate": 1.6854567526021344e-06,
"loss": 0.1259,
"step": 2135
},
{
"epoch": 1.518807665010646,
"grad_norm": 0.2770947677393959,
"learning_rate": 1.662333265435042e-06,
"loss": 0.1325,
"step": 2140
},
{
"epoch": 1.5223562810503903,
"grad_norm": 0.22743676823776157,
"learning_rate": 1.6393378292275658e-06,
"loss": 0.1046,
"step": 2145
},
{
"epoch": 1.5259048970901348,
"grad_norm": 0.22463849593545182,
"learning_rate": 1.6164713262090925e-06,
"loss": 0.1222,
"step": 2150
},
{
"epoch": 1.5294535131298792,
"grad_norm": 0.25992298503818306,
"learning_rate": 1.5937346336624304e-06,
"loss": 0.1054,
"step": 2155
},
{
"epoch": 1.5330021291696239,
"grad_norm": 0.24333389296266456,
"learning_rate": 1.571128623890159e-06,
"loss": 0.1234,
"step": 2160
},
{
"epoch": 1.5365507452093683,
"grad_norm": 0.1576217297203735,
"learning_rate": 1.548654164181157e-06,
"loss": 0.0812,
"step": 2165
},
{
"epoch": 1.540099361249113,
"grad_norm": 0.25124974098461594,
"learning_rate": 1.526312116777336e-06,
"loss": 0.1223,
"step": 2170
},
{
"epoch": 1.5436479772888574,
"grad_norm": 0.1805572426097591,
"learning_rate": 1.5041033388405484e-06,
"loss": 0.1011,
"step": 2175
},
{
"epoch": 1.5471965933286018,
"grad_norm": 0.2635173275494755,
"learning_rate": 1.4820286824197123e-06,
"loss": 0.1022,
"step": 2180
},
{
"epoch": 1.5507452093683463,
"grad_norm": 0.2519229374420747,
"learning_rate": 1.4600889944181174e-06,
"loss": 0.096,
"step": 2185
},
{
"epoch": 1.5542938254080907,
"grad_norm": 0.2649903063198177,
"learning_rate": 1.4382851165609334e-06,
"loss": 0.1109,
"step": 2190
},
{
"epoch": 1.5578424414478353,
"grad_norm": 0.23665945872290622,
"learning_rate": 1.4166178853629203e-06,
"loss": 0.0968,
"step": 2195
},
{
"epoch": 1.56139105748758,
"grad_norm": 0.25098309375854744,
"learning_rate": 1.3950881320963304e-06,
"loss": 0.1017,
"step": 2200
},
{
"epoch": 1.5649396735273244,
"grad_norm": 0.18767287759613047,
"learning_rate": 1.3736966827590204e-06,
"loss": 0.0873,
"step": 2205
},
{
"epoch": 1.5684882895670689,
"grad_norm": 0.21049901799309081,
"learning_rate": 1.3524443580427565e-06,
"loss": 0.0869,
"step": 2210
},
{
"epoch": 1.5720369056068133,
"grad_norm": 0.1425904930686491,
"learning_rate": 1.3313319733017376e-06,
"loss": 0.0775,
"step": 2215
},
{
"epoch": 1.5755855216465577,
"grad_norm": 0.1721847853811724,
"learning_rate": 1.310360338521302e-06,
"loss": 0.0976,
"step": 2220
},
{
"epoch": 1.5791341376863024,
"grad_norm": 0.2072546605022138,
"learning_rate": 1.2895302582868612e-06,
"loss": 0.1038,
"step": 2225
},
{
"epoch": 1.5826827537260468,
"grad_norm": 0.18546013706742767,
"learning_rate": 1.268842531753024e-06,
"loss": 0.0963,
"step": 2230
},
{
"epoch": 1.5862313697657915,
"grad_norm": 0.24037696856938554,
"learning_rate": 1.2482979526129452e-06,
"loss": 0.0952,
"step": 2235
},
{
"epoch": 1.589779985805536,
"grad_norm": 0.20569567459581428,
"learning_rate": 1.2278973090678692e-06,
"loss": 0.094,
"step": 2240
},
{
"epoch": 1.5933286018452804,
"grad_norm": 0.17589280363526694,
"learning_rate": 1.207641383796892e-06,
"loss": 0.1005,
"step": 2245
},
{
"epoch": 1.5968772178850248,
"grad_norm": 0.16432603828811707,
"learning_rate": 1.1875309539269332e-06,
"loss": 0.1067,
"step": 2250
},
{
"epoch": 1.6004258339247692,
"grad_norm": 0.14498826888722152,
"learning_rate": 1.167566791002921e-06,
"loss": 0.0861,
"step": 2255
},
{
"epoch": 1.6039744499645139,
"grad_norm": 0.2110225317057283,
"learning_rate": 1.1477496609581946e-06,
"loss": 0.0964,
"step": 2260
},
{
"epoch": 1.6075230660042583,
"grad_norm": 0.20764105569594873,
"learning_rate": 1.1280803240851129e-06,
"loss": 0.0868,
"step": 2265
},
{
"epoch": 1.611071682044003,
"grad_norm": 0.19107043293753664,
"learning_rate": 1.1085595350058904e-06,
"loss": 0.0993,
"step": 2270
},
{
"epoch": 1.6146202980837474,
"grad_norm": 0.2322126900568226,
"learning_rate": 1.0891880426436435e-06,
"loss": 0.0964,
"step": 2275
},
{
"epoch": 1.6181689141234918,
"grad_norm": 0.13623595050892906,
"learning_rate": 1.0699665901936595e-06,
"loss": 0.1009,
"step": 2280
},
{
"epoch": 1.6217175301632363,
"grad_norm": 0.1591220315778765,
"learning_rate": 1.0508959150948822e-06,
"loss": 0.1001,
"step": 2285
},
{
"epoch": 1.6252661462029807,
"grad_norm": 0.24004716673738535,
"learning_rate": 1.0319767490016196e-06,
"loss": 0.1018,
"step": 2290
},
{
"epoch": 1.6288147622427254,
"grad_norm": 0.20108223866992236,
"learning_rate": 1.0132098177554761e-06,
"loss": 0.0772,
"step": 2295
},
{
"epoch": 1.6323633782824698,
"grad_norm": 0.1902841124570637,
"learning_rate": 9.945958413575007e-07,
"loss": 0.0836,
"step": 2300
},
{
"epoch": 1.6359119943222145,
"grad_norm": 0.19566816160223913,
"learning_rate": 9.761355339405692e-07,
"loss": 0.0782,
"step": 2305
},
{
"epoch": 1.639460610361959,
"grad_norm": 0.15951200187445064,
"learning_rate": 9.57829603741982e-07,
"loss": 0.0953,
"step": 2310
},
{
"epoch": 1.6430092264017033,
"grad_norm": 0.18548232850931468,
"learning_rate": 9.396787530762947e-07,
"loss": 0.096,
"step": 2315
},
{
"epoch": 1.6465578424414478,
"grad_norm": 0.19921321830785457,
"learning_rate": 9.216836783083722e-07,
"loss": 0.088,
"step": 2320
},
{
"epoch": 1.6501064584811922,
"grad_norm": 0.19447804509558328,
"learning_rate": 9.038450698266732e-07,
"loss": 0.0923,
"step": 2325
},
{
"epoch": 1.6536550745209369,
"grad_norm": 0.15880291937153415,
"learning_rate": 8.861636120167632e-07,
"loss": 0.0752,
"step": 2330
},
{
"epoch": 1.6572036905606813,
"grad_norm": 0.1216232913360999,
"learning_rate": 8.686399832350567e-07,
"loss": 0.0929,
"step": 2335
},
{
"epoch": 1.660752306600426,
"grad_norm": 0.25799634010332295,
"learning_rate": 8.512748557827927e-07,
"loss": 0.0801,
"step": 2340
},
{
"epoch": 1.6643009226401704,
"grad_norm": 0.22041336862684355,
"learning_rate": 8.340688958802407e-07,
"loss": 0.0886,
"step": 2345
},
{
"epoch": 1.6678495386799148,
"grad_norm": 0.17513196460398783,
"learning_rate": 8.170227636411448e-07,
"loss": 0.0807,
"step": 2350
},
{
"epoch": 1.6713981547196592,
"grad_norm": 0.1575347574847102,
"learning_rate": 8.001371130473906e-07,
"loss": 0.0826,
"step": 2355
},
{
"epoch": 1.6749467707594037,
"grad_norm": 0.20356373103632977,
"learning_rate": 7.834125919239222e-07,
"loss": 0.0933,
"step": 2360
},
{
"epoch": 1.6784953867991483,
"grad_norm": 0.12469606641503765,
"learning_rate": 7.668498419138831e-07,
"loss": 0.0709,
"step": 2365
},
{
"epoch": 1.682044002838893,
"grad_norm": 0.17056082766780378,
"learning_rate": 7.504494984540033e-07,
"loss": 0.0811,
"step": 2370
},
{
"epoch": 1.6855926188786374,
"grad_norm": 0.12812663368743,
"learning_rate": 7.34212190750217e-07,
"loss": 0.083,
"step": 2375
},
{
"epoch": 1.6891412349183819,
"grad_norm": 0.1901394592444526,
"learning_rate": 7.181385417535253e-07,
"loss": 0.0742,
"step": 2380
},
{
"epoch": 1.6926898509581263,
"grad_norm": 0.26009578065156624,
"learning_rate": 7.02229168136096e-07,
"loss": 0.0846,
"step": 2385
},
{
"epoch": 1.6962384669978707,
"grad_norm": 0.15155874952679413,
"learning_rate": 6.864846802676028e-07,
"loss": 0.0791,
"step": 2390
},
{
"epoch": 1.6997870830376152,
"grad_norm": 0.14760161262690638,
"learning_rate": 6.709056821918109e-07,
"loss": 0.072,
"step": 2395
},
{
"epoch": 1.7033356990773598,
"grad_norm": 0.19330420416295252,
"learning_rate": 6.554927716034015e-07,
"loss": 0.0856,
"step": 2400
},
{
"epoch": 1.7068843151171045,
"grad_norm": 0.2023270743554052,
"learning_rate": 6.402465398250396e-07,
"loss": 0.0792,
"step": 2405
},
{
"epoch": 1.710432931156849,
"grad_norm": 0.19479492119153588,
"learning_rate": 6.251675717846905e-07,
"loss": 0.1031,
"step": 2410
},
{
"epoch": 1.7139815471965933,
"grad_norm": 0.17016708319833052,
"learning_rate": 6.102564459931765e-07,
"loss": 0.0939,
"step": 2415
},
{
"epoch": 1.7175301632363378,
"grad_norm": 0.1627417227150133,
"learning_rate": 5.95513734521983e-07,
"loss": 0.0965,
"step": 2420
},
{
"epoch": 1.7210787792760822,
"grad_norm": 0.1517445461802794,
"learning_rate": 5.809400029813106e-07,
"loss": 0.0745,
"step": 2425
},
{
"epoch": 1.7246273953158269,
"grad_norm": 0.22409152080710204,
"learning_rate": 5.665358104983753e-07,
"loss": 0.0815,
"step": 2430
},
{
"epoch": 1.7281760113555713,
"grad_norm": 0.2508370406777965,
"learning_rate": 5.523017096959555e-07,
"loss": 0.0873,
"step": 2435
},
{
"epoch": 1.731724627395316,
"grad_norm": 0.12671885690626034,
"learning_rate": 5.382382466711972e-07,
"loss": 0.0988,
"step": 2440
},
{
"epoch": 1.7352732434350604,
"grad_norm": 0.16316925407933794,
"learning_rate": 5.243459609746521e-07,
"loss": 0.0768,
"step": 2445
},
{
"epoch": 1.7388218594748048,
"grad_norm": 0.1830877281891777,
"learning_rate": 5.106253855895865e-07,
"loss": 0.0916,
"step": 2450
},
{
"epoch": 1.7423704755145493,
"grad_norm": 0.14755589729277402,
"learning_rate": 4.970770469115283e-07,
"loss": 0.0765,
"step": 2455
},
{
"epoch": 1.7459190915542937,
"grad_norm": 0.18250179497312569,
"learning_rate": 4.837014647280741e-07,
"loss": 0.0961,
"step": 2460
},
{
"epoch": 1.7494677075940384,
"grad_norm": 0.20587238731058435,
"learning_rate": 4.704991521989466e-07,
"loss": 0.0912,
"step": 2465
},
{
"epoch": 1.7530163236337828,
"grad_norm": 0.1632328577893508,
"learning_rate": 4.5747061583630404e-07,
"loss": 0.0828,
"step": 2470
},
{
"epoch": 1.7565649396735274,
"grad_norm": 0.1783668994748264,
"learning_rate": 4.4461635548531444e-07,
"loss": 0.0781,
"step": 2475
},
{
"epoch": 1.7601135557132719,
"grad_norm": 0.1551917662601825,
"learning_rate": 4.3193686430497204e-07,
"loss": 0.0673,
"step": 2480
},
{
"epoch": 1.7636621717530163,
"grad_norm": 0.34742297109827425,
"learning_rate": 4.194326287491818e-07,
"loss": 0.0807,
"step": 2485
},
{
"epoch": 1.7672107877927608,
"grad_norm": 0.11673745718310856,
"learning_rate": 4.0710412854809255e-07,
"loss": 0.0743,
"step": 2490
},
{
"epoch": 1.7707594038325052,
"grad_norm": 0.1369038071833693,
"learning_rate": 3.949518366896954e-07,
"loss": 0.0785,
"step": 2495
},
{
"epoch": 1.7743080198722498,
"grad_norm": 0.1796729155628991,
"learning_rate": 3.829762194016745e-07,
"loss": 0.0806,
"step": 2500
},
{
"epoch": 1.7778566359119943,
"grad_norm": 0.2641121549347932,
"learning_rate": 3.7117773613352226e-07,
"loss": 0.0954,
"step": 2505
},
{
"epoch": 1.781405251951739,
"grad_norm": 0.1563141596386351,
"learning_rate": 3.595568395389104e-07,
"loss": 0.0873,
"step": 2510
},
{
"epoch": 1.7849538679914834,
"grad_norm": 0.12073227545401369,
"learning_rate": 3.481139754583263e-07,
"loss": 0.0708,
"step": 2515
},
{
"epoch": 1.7885024840312278,
"grad_norm": 0.20809531241637985,
"learning_rate": 3.368495829019652e-07,
"loss": 0.0793,
"step": 2520
},
{
"epoch": 1.7920511000709722,
"grad_norm": 0.13218046003099626,
"learning_rate": 3.2576409403288764e-07,
"loss": 0.0761,
"step": 2525
},
{
"epoch": 1.7955997161107167,
"grad_norm": 0.15722684002637335,
"learning_rate": 3.1485793415044483e-07,
"loss": 0.0597,
"step": 2530
},
{
"epoch": 1.7991483321504613,
"grad_norm": 0.15659033530844094,
"learning_rate": 3.0413152167395375e-07,
"loss": 0.0708,
"step": 2535
},
{
"epoch": 1.802696948190206,
"grad_norm": 0.1638051999867514,
"learning_rate": 2.9358526812664933e-07,
"loss": 0.0775,
"step": 2540
},
{
"epoch": 1.8062455642299504,
"grad_norm": 0.11504369754799153,
"learning_rate": 2.832195781198932e-07,
"loss": 0.0716,
"step": 2545
},
{
"epoch": 1.8097941802696949,
"grad_norm": 0.12074103630664931,
"learning_rate": 2.73034849337655e-07,
"loss": 0.074,
"step": 2550
},
{
"epoch": 1.8133427963094393,
"grad_norm": 0.16166884798766148,
"learning_rate": 2.630314725212507e-07,
"loss": 0.0854,
"step": 2555
},
{
"epoch": 1.8168914123491837,
"grad_norm": 0.1769340537838461,
"learning_rate": 2.532098314543546e-07,
"loss": 0.0831,
"step": 2560
},
{
"epoch": 1.8204400283889282,
"grad_norm": 0.12448298205459797,
"learning_rate": 2.4357030294827333e-07,
"loss": 0.0938,
"step": 2565
},
{
"epoch": 1.8239886444286728,
"grad_norm": 0.1347162460758778,
"learning_rate": 2.3411325682748843e-07,
"loss": 0.0786,
"step": 2570
},
{
"epoch": 1.8275372604684175,
"grad_norm": 0.18021228911367754,
"learning_rate": 2.2483905591547396e-07,
"loss": 0.0706,
"step": 2575
},
{
"epoch": 1.831085876508162,
"grad_norm": 0.15388878946960718,
"learning_rate": 2.1574805602076808e-07,
"loss": 0.0742,
"step": 2580
},
{
"epoch": 1.8346344925479063,
"grad_norm": 0.1409815076088805,
"learning_rate": 2.0684060592332856e-07,
"loss": 0.0813,
"step": 2585
},
{
"epoch": 1.8381831085876508,
"grad_norm": 0.132105830265776,
"learning_rate": 1.9811704736114768e-07,
"loss": 0.0795,
"step": 2590
},
{
"epoch": 1.8417317246273952,
"grad_norm": 0.1608151994733542,
"learning_rate": 1.8957771501714572e-07,
"loss": 0.0747,
"step": 2595
},
{
"epoch": 1.8452803406671399,
"grad_norm": 0.25690392163293946,
"learning_rate": 1.812229365063256e-07,
"loss": 0.0949,
"step": 2600
},
{
"epoch": 1.8488289567068843,
"grad_norm": 0.12813812442563294,
"learning_rate": 1.7305303236320846e-07,
"loss": 0.0709,
"step": 2605
},
{
"epoch": 1.852377572746629,
"grad_norm": 0.2552017585952295,
"learning_rate": 1.6506831602953298e-07,
"loss": 0.082,
"step": 2610
},
{
"epoch": 1.8559261887863734,
"grad_norm": 0.11392874297086744,
"learning_rate": 1.5726909384223167e-07,
"loss": 0.0785,
"step": 2615
},
{
"epoch": 1.8594748048261178,
"grad_norm": 0.0926818641888793,
"learning_rate": 1.4965566502167738e-07,
"loss": 0.0732,
"step": 2620
},
{
"epoch": 1.8630234208658623,
"grad_norm": 0.1745896048261097,
"learning_rate": 1.422283216602044e-07,
"loss": 0.07,
"step": 2625
},
{
"epoch": 1.8665720369056067,
"grad_norm": 0.25569276554141235,
"learning_rate": 1.3498734871090047e-07,
"loss": 0.0744,
"step": 2630
},
{
"epoch": 1.8701206529453513,
"grad_norm": 0.1561840086119096,
"learning_rate": 1.2793302397667795e-07,
"loss": 0.1031,
"step": 2635
},
{
"epoch": 1.8736692689850958,
"grad_norm": 0.17002830056241805,
"learning_rate": 1.2106561809961115e-07,
"loss": 0.0604,
"step": 2640
},
{
"epoch": 1.8772178850248404,
"grad_norm": 0.09880195273905168,
"learning_rate": 1.1438539455055686e-07,
"loss": 0.0762,
"step": 2645
},
{
"epoch": 1.8807665010645849,
"grad_norm": 0.15881716388407485,
"learning_rate": 1.0789260961904357e-07,
"loss": 0.0732,
"step": 2650
},
{
"epoch": 1.8843151171043293,
"grad_norm": 0.12450413227358498,
"learning_rate": 1.01587512403441e-07,
"loss": 0.0753,
"step": 2655
},
{
"epoch": 1.8878637331440737,
"grad_norm": 0.16616205486910834,
"learning_rate": 9.547034480140216e-08,
"loss": 0.0653,
"step": 2660
},
{
"epoch": 1.8914123491838182,
"grad_norm": 0.1469708894112524,
"learning_rate": 8.954134150058247e-08,
"loss": 0.0811,
"step": 2665
},
{
"epoch": 1.8949609652235628,
"grad_norm": 0.12726157309621852,
"learning_rate": 8.38007299696375e-08,
"loss": 0.0868,
"step": 2670
},
{
"epoch": 1.8985095812633073,
"grad_norm": 0.19648699971776568,
"learning_rate": 7.824873044949332e-08,
"loss": 0.0807,
"step": 2675
},
{
"epoch": 1.902058197303052,
"grad_norm": 0.11546091276599789,
"learning_rate": 7.288555594489933e-08,
"loss": 0.0712,
"step": 2680
},
{
"epoch": 1.9056068133427964,
"grad_norm": 0.10138694402799019,
"learning_rate": 6.771141221625588e-08,
"loss": 0.0727,
"step": 2685
},
{
"epoch": 1.9091554293825408,
"grad_norm": 0.20198542990475563,
"learning_rate": 6.272649777171902e-08,
"loss": 0.0763,
"step": 2690
},
{
"epoch": 1.9127040454222852,
"grad_norm": 0.13013805786355714,
"learning_rate": 5.793100385958539e-08,
"loss": 0.077,
"step": 2695
},
{
"epoch": 1.9162526614620297,
"grad_norm": 0.1042225823119502,
"learning_rate": 5.332511446095534e-08,
"loss": 0.0656,
"step": 2700
},
{
"epoch": 1.9198012775017743,
"grad_norm": 0.14869482521151722,
"learning_rate": 4.890900628267303e-08,
"loss": 0.0843,
"step": 2705
},
{
"epoch": 1.923349893541519,
"grad_norm": 0.12632238867825482,
"learning_rate": 4.468284875054851e-08,
"loss": 0.0732,
"step": 2710
},
{
"epoch": 1.9268985095812634,
"grad_norm": 0.15464924395801358,
"learning_rate": 4.0646804002857363e-08,
"loss": 0.0672,
"step": 2715
},
{
"epoch": 1.9304471256210078,
"grad_norm": 0.13140515404824576,
"learning_rate": 3.680102688411957e-08,
"loss": 0.0763,
"step": 2720
},
{
"epoch": 1.9339957416607523,
"grad_norm": 0.12137789420912942,
"learning_rate": 3.3145664939158716e-08,
"loss": 0.0753,
"step": 2725
},
{
"epoch": 1.9375443577004967,
"grad_norm": 0.14998755773936648,
"learning_rate": 2.9680858407441503e-08,
"loss": 0.0774,
"step": 2730
},
{
"epoch": 1.9410929737402411,
"grad_norm": 0.10160902415138964,
"learning_rate": 2.640674021769929e-08,
"loss": 0.0715,
"step": 2735
},
{
"epoch": 1.9446415897799858,
"grad_norm": 0.15110326146665365,
"learning_rate": 2.3323435982825494e-08,
"loss": 0.0701,
"step": 2740
},
{
"epoch": 1.9481902058197305,
"grad_norm": 0.1628264579317661,
"learning_rate": 2.0431063995056676e-08,
"loss": 0.074,
"step": 2745
},
{
"epoch": 1.951738821859475,
"grad_norm": 0.16758064162952077,
"learning_rate": 1.772973522143673e-08,
"loss": 0.0691,
"step": 2750
},
{
"epoch": 1.9552874378992193,
"grad_norm": 0.18640301521721325,
"learning_rate": 1.5219553299556934e-08,
"loss": 0.077,
"step": 2755
},
{
"epoch": 1.9588360539389638,
"grad_norm": 0.12508839216721881,
"learning_rate": 1.290061453358138e-08,
"loss": 0.0703,
"step": 2760
},
{
"epoch": 1.9623846699787082,
"grad_norm": 0.12573094619311348,
"learning_rate": 1.0773007890551578e-08,
"loss": 0.0755,
"step": 2765
},
{
"epoch": 1.9659332860184529,
"grad_norm": 0.12490681654446255,
"learning_rate": 8.836814996971977e-09,
"loss": 0.0596,
"step": 2770
},
{
"epoch": 1.9694819020581973,
"grad_norm": 0.16829669835922828,
"learning_rate": 7.092110135681895e-09,
"loss": 0.078,
"step": 2775
},
{
"epoch": 1.973030518097942,
"grad_norm": 0.11404355662235376,
"learning_rate": 5.538960243002267e-09,
"loss": 0.0772,
"step": 2780
},
{
"epoch": 1.9765791341376864,
"grad_norm": 0.11299640900034978,
"learning_rate": 4.177424906168237e-09,
"loss": 0.0634,
"step": 2785
},
{
"epoch": 1.9801277501774308,
"grad_norm": 0.10294782038389917,
"learning_rate": 3.007556361043773e-09,
"loss": 0.0723,
"step": 2790
},
{
"epoch": 1.9836763662171752,
"grad_norm": 0.16091006854892712,
"learning_rate": 2.0293994901182666e-09,
"loss": 0.0669,
"step": 2795
},
{
"epoch": 1.9872249822569197,
"grad_norm": 0.23548530390053649,
"learning_rate": 1.2429918207829127e-09,
"loss": 0.0819,
"step": 2800
},
{
"epoch": 1.9907735982966643,
"grad_norm": 0.1739766519225282,
"learning_rate": 6.483635238918595e-10,
"loss": 0.0811,
"step": 2805
},
{
"epoch": 1.9943222143364088,
"grad_norm": 0.16700964731575352,
"learning_rate": 2.4553741260535667e-10,
"loss": 0.092,
"step": 2810
},
{
"epoch": 1.9978708303761534,
"grad_norm": 0.14284331771864398,
"learning_rate": 3.452894151267927e-11,
"loss": 0.0763,
"step": 2815
},
{
"epoch": 2.0,
"eval_loss": 0.1015239953994751,
"eval_runtime": 35.1893,
"eval_samples_per_second": 19.438,
"eval_steps_per_second": 4.859,
"step": 2818
},
{
"epoch": 2.0,
"step": 2818,
"total_flos": 7.824243780420108e+18,
"train_loss": 0.26107758036143736,
"train_runtime": 22639.1204,
"train_samples_per_second": 5.975,
"train_steps_per_second": 0.124
}
],
"logging_steps": 5,
"max_steps": 2818,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.824243780420108e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}