{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996003729852138,
"eval_steps": 500,
"global_step": 1876,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005328360197149328,
"grad_norm": 0.3923943299533747,
"learning_rate": 1.0638297872340427e-06,
"loss": 1.5282,
"step": 1
},
{
"epoch": 0.0026641800985746636,
"grad_norm": 0.39997680726308193,
"learning_rate": 5.319148936170213e-06,
"loss": 1.5368,
"step": 5
},
{
"epoch": 0.005328360197149327,
"grad_norm": 0.42655574017759024,
"learning_rate": 1.0638297872340426e-05,
"loss": 1.5771,
"step": 10
},
{
"epoch": 0.007992540295723992,
"grad_norm": 0.4800547793020182,
"learning_rate": 1.595744680851064e-05,
"loss": 1.5624,
"step": 15
},
{
"epoch": 0.010656720394298654,
"grad_norm": 0.36870917165643946,
"learning_rate": 2.1276595744680852e-05,
"loss": 1.5192,
"step": 20
},
{
"epoch": 0.013320900492873319,
"grad_norm": 0.23219684210183744,
"learning_rate": 2.6595744680851064e-05,
"loss": 1.5223,
"step": 25
},
{
"epoch": 0.015985080591447983,
"grad_norm": 0.2091606557073564,
"learning_rate": 3.191489361702128e-05,
"loss": 1.4527,
"step": 30
},
{
"epoch": 0.018649260690022644,
"grad_norm": 0.18998890324763035,
"learning_rate": 3.723404255319149e-05,
"loss": 1.4828,
"step": 35
},
{
"epoch": 0.02131344078859731,
"grad_norm": 0.14996191641971635,
"learning_rate": 4.2553191489361704e-05,
"loss": 1.4084,
"step": 40
},
{
"epoch": 0.023977620887171973,
"grad_norm": 0.16249029940771254,
"learning_rate": 4.787234042553192e-05,
"loss": 1.4001,
"step": 45
},
{
"epoch": 0.026641800985746637,
"grad_norm": 0.1460528370416976,
"learning_rate": 5.319148936170213e-05,
"loss": 1.3865,
"step": 50
},
{
"epoch": 0.0293059810843213,
"grad_norm": 0.12748888043071394,
"learning_rate": 5.851063829787234e-05,
"loss": 1.3832,
"step": 55
},
{
"epoch": 0.031970161182895966,
"grad_norm": 0.1047553716550612,
"learning_rate": 6.382978723404256e-05,
"loss": 1.3383,
"step": 60
},
{
"epoch": 0.03463434128147063,
"grad_norm": 1.0945957696792898,
"learning_rate": 6.914893617021277e-05,
"loss": 1.3216,
"step": 65
},
{
"epoch": 0.03729852138004529,
"grad_norm": 0.0891343071370869,
"learning_rate": 7.446808510638298e-05,
"loss": 1.3098,
"step": 70
},
{
"epoch": 0.03996270147861995,
"grad_norm": 0.07870697792715504,
"learning_rate": 7.978723404255319e-05,
"loss": 1.3138,
"step": 75
},
{
"epoch": 0.04262688157719462,
"grad_norm": 0.08760750591416677,
"learning_rate": 8.510638297872341e-05,
"loss": 1.3006,
"step": 80
},
{
"epoch": 0.04529106167576928,
"grad_norm": 0.08120742351067671,
"learning_rate": 9.042553191489363e-05,
"loss": 1.2385,
"step": 85
},
{
"epoch": 0.047955241774343946,
"grad_norm": 0.08142820910966997,
"learning_rate": 9.574468085106384e-05,
"loss": 1.2945,
"step": 90
},
{
"epoch": 0.05061942187291861,
"grad_norm": 0.09177922715037878,
"learning_rate": 0.00010106382978723406,
"loss": 1.2761,
"step": 95
},
{
"epoch": 0.053283601971493275,
"grad_norm": 0.07677835076593886,
"learning_rate": 0.00010638297872340425,
"loss": 1.244,
"step": 100
},
{
"epoch": 0.05594778207006794,
"grad_norm": 0.08525332172522379,
"learning_rate": 0.00011170212765957446,
"loss": 1.2838,
"step": 105
},
{
"epoch": 0.0586119621686426,
"grad_norm": 0.10003274873557398,
"learning_rate": 0.00011702127659574468,
"loss": 1.2489,
"step": 110
},
{
"epoch": 0.06127614226721726,
"grad_norm": 0.09434455492112725,
"learning_rate": 0.0001223404255319149,
"loss": 1.2503,
"step": 115
},
{
"epoch": 0.06394032236579193,
"grad_norm": 0.0960230638906738,
"learning_rate": 0.00012765957446808513,
"loss": 1.2205,
"step": 120
},
{
"epoch": 0.0666045024643666,
"grad_norm": 0.09721981845211174,
"learning_rate": 0.00013297872340425532,
"loss": 1.2624,
"step": 125
},
{
"epoch": 0.06926868256294126,
"grad_norm": 0.08809066774892928,
"learning_rate": 0.00013829787234042554,
"loss": 1.2545,
"step": 130
},
{
"epoch": 0.07193286266151591,
"grad_norm": 0.26229245154975894,
"learning_rate": 0.00014361702127659576,
"loss": 1.2408,
"step": 135
},
{
"epoch": 0.07459704276009058,
"grad_norm": 0.10552899642439768,
"learning_rate": 0.00014893617021276596,
"loss": 1.2392,
"step": 140
},
{
"epoch": 0.07726122285866524,
"grad_norm": 0.10911221765360271,
"learning_rate": 0.00015425531914893618,
"loss": 1.2148,
"step": 145
},
{
"epoch": 0.0799254029572399,
"grad_norm": 0.11632059103832315,
"learning_rate": 0.00015957446808510637,
"loss": 1.2382,
"step": 150
},
{
"epoch": 0.08258958305581457,
"grad_norm": 0.10281933721760748,
"learning_rate": 0.00016489361702127662,
"loss": 1.226,
"step": 155
},
{
"epoch": 0.08525376315438923,
"grad_norm": 0.10561194502249595,
"learning_rate": 0.00017021276595744682,
"loss": 1.2531,
"step": 160
},
{
"epoch": 0.0879179432529639,
"grad_norm": 0.10407844313384682,
"learning_rate": 0.000175531914893617,
"loss": 1.2428,
"step": 165
},
{
"epoch": 0.09058212335153856,
"grad_norm": 0.08952286824052161,
"learning_rate": 0.00018085106382978726,
"loss": 1.2176,
"step": 170
},
{
"epoch": 0.09324630345011323,
"grad_norm": 0.0938821785588311,
"learning_rate": 0.00018617021276595746,
"loss": 1.2307,
"step": 175
},
{
"epoch": 0.09591048354868789,
"grad_norm": 0.1162063476232978,
"learning_rate": 0.00019148936170212768,
"loss": 1.2276,
"step": 180
},
{
"epoch": 0.09857466364726256,
"grad_norm": 0.09632362372953375,
"learning_rate": 0.00019680851063829787,
"loss": 1.213,
"step": 185
},
{
"epoch": 0.10123884374583722,
"grad_norm": 0.0984821085857903,
"learning_rate": 0.00019999930723752516,
"loss": 1.2093,
"step": 190
},
{
"epoch": 0.10390302384441188,
"grad_norm": 0.09843324070313318,
"learning_rate": 0.00019999151376991434,
"loss": 1.2405,
"step": 195
},
{
"epoch": 0.10656720394298655,
"grad_norm": 0.09038802438280195,
"learning_rate": 0.00019997506155872244,
"loss": 1.2226,
"step": 200
},
{
"epoch": 0.10923138404156121,
"grad_norm": 0.09204563702733434,
"learning_rate": 0.00019994995202862512,
"loss": 1.1841,
"step": 205
},
{
"epoch": 0.11189556414013588,
"grad_norm": 0.09153264803236888,
"learning_rate": 0.00019991618735397672,
"loss": 1.1963,
"step": 210
},
{
"epoch": 0.11455974423871054,
"grad_norm": 0.10122556405254433,
"learning_rate": 0.00019987377045862202,
"loss": 1.1912,
"step": 215
},
{
"epoch": 0.1172239243372852,
"grad_norm": 0.10731171538200276,
"learning_rate": 0.00019982270501564284,
"loss": 1.2206,
"step": 220
},
{
"epoch": 0.11988810443585986,
"grad_norm": 0.10261300609074893,
"learning_rate": 0.00019976299544704026,
"loss": 1.2063,
"step": 225
},
{
"epoch": 0.12255228453443452,
"grad_norm": 0.10220846662233612,
"learning_rate": 0.00019969464692335152,
"loss": 1.2176,
"step": 230
},
{
"epoch": 0.1252164646330092,
"grad_norm": 0.106711961435993,
"learning_rate": 0.00019961766536320225,
"loss": 1.2338,
"step": 235
},
{
"epoch": 0.12788064473158386,
"grad_norm": 0.08642302944089619,
"learning_rate": 0.0001995320574327941,
"loss": 1.1834,
"step": 240
},
{
"epoch": 0.13054482483015853,
"grad_norm": 0.10984213170710444,
"learning_rate": 0.00019943783054532732,
"loss": 1.2157,
"step": 245
},
{
"epoch": 0.1332090049287332,
"grad_norm": 0.09485880486202997,
"learning_rate": 0.00019933499286035894,
"loss": 1.204,
"step": 250
},
{
"epoch": 0.13587318502730786,
"grad_norm": 0.10445439668189137,
"learning_rate": 0.0001992235532830961,
"loss": 1.2193,
"step": 255
},
{
"epoch": 0.13853736512588252,
"grad_norm": 0.08363317544398702,
"learning_rate": 0.00019910352146362497,
"loss": 1.1989,
"step": 260
},
{
"epoch": 0.1412015452244572,
"grad_norm": 0.13246579577470613,
"learning_rate": 0.00019897490779607514,
"loss": 1.1942,
"step": 265
},
{
"epoch": 0.14386572532303182,
"grad_norm": 0.11140593579855256,
"learning_rate": 0.00019883772341771936,
"loss": 1.226,
"step": 270
},
{
"epoch": 0.1465299054216065,
"grad_norm": 0.09879562730310984,
"learning_rate": 0.0001986919802080093,
"loss": 1.2206,
"step": 275
},
{
"epoch": 0.14919408552018115,
"grad_norm": 0.09576696495195093,
"learning_rate": 0.00019853769078754686,
"loss": 1.2156,
"step": 280
},
{
"epoch": 0.15185826561875582,
"grad_norm": 0.09339602143232201,
"learning_rate": 0.00019837486851699104,
"loss": 1.2136,
"step": 285
},
{
"epoch": 0.15452244571733048,
"grad_norm": 0.1015702269360243,
"learning_rate": 0.0001982035274959014,
"loss": 1.228,
"step": 290
},
{
"epoch": 0.15718662581590515,
"grad_norm": 0.10731429321620178,
"learning_rate": 0.0001980236825615166,
"loss": 1.2084,
"step": 295
},
{
"epoch": 0.1598508059144798,
"grad_norm": 0.11015660781012064,
"learning_rate": 0.00019783534928747006,
"loss": 1.233,
"step": 300
},
{
"epoch": 0.16251498601305447,
"grad_norm": 0.09317505295867419,
"learning_rate": 0.000197638543982441,
"loss": 1.1949,
"step": 305
},
{
"epoch": 0.16517916611162914,
"grad_norm": 0.10444172159792364,
"learning_rate": 0.00019743328368874237,
"loss": 1.2077,
"step": 310
},
{
"epoch": 0.1678433462102038,
"grad_norm": 0.08551570812304626,
"learning_rate": 0.00019721958618084507,
"loss": 1.206,
"step": 315
},
{
"epoch": 0.17050752630877847,
"grad_norm": 0.08486005829321344,
"learning_rate": 0.00019699746996383878,
"loss": 1.2162,
"step": 320
},
{
"epoch": 0.17317170640735313,
"grad_norm": 0.08572790019112694,
"learning_rate": 0.00019676695427182938,
"loss": 1.1866,
"step": 325
},
{
"epoch": 0.1758358865059278,
"grad_norm": 0.0948088094425236,
"learning_rate": 0.00019652805906627356,
"loss": 1.1903,
"step": 330
},
{
"epoch": 0.17850006660450246,
"grad_norm": 0.09120766035226856,
"learning_rate": 0.00019628080503425013,
"loss": 1.2231,
"step": 335
},
{
"epoch": 0.18116424670307713,
"grad_norm": 0.10099249661711791,
"learning_rate": 0.0001960252135866687,
"loss": 1.192,
"step": 340
},
{
"epoch": 0.1838284268016518,
"grad_norm": 0.08142630900860537,
"learning_rate": 0.0001957613068564156,
"loss": 1.2093,
"step": 345
},
{
"epoch": 0.18649260690022645,
"grad_norm": 0.08243035183498343,
"learning_rate": 0.00019548910769643722,
"loss": 1.2232,
"step": 350
},
{
"epoch": 0.18915678699880112,
"grad_norm": 0.08113246622113654,
"learning_rate": 0.00019520863967776116,
"loss": 1.1773,
"step": 355
},
{
"epoch": 0.19182096709737578,
"grad_norm": 0.08217032990907831,
"learning_rate": 0.000194919927087455,
"loss": 1.1909,
"step": 360
},
{
"epoch": 0.19448514719595045,
"grad_norm": 0.08936356448152723,
"learning_rate": 0.00019462299492652336,
"loss": 1.1768,
"step": 365
},
{
"epoch": 0.1971493272945251,
"grad_norm": 0.09171279408988006,
"learning_rate": 0.00019431786890774264,
"loss": 1.1899,
"step": 370
},
{
"epoch": 0.19981350739309978,
"grad_norm": 0.08313643695167447,
"learning_rate": 0.00019400457545343464,
"loss": 1.224,
"step": 375
},
{
"epoch": 0.20247768749167444,
"grad_norm": 0.08891928174737382,
"learning_rate": 0.00019368314169317856,
"loss": 1.1723,
"step": 380
},
{
"epoch": 0.2051418675902491,
"grad_norm": 0.08770933776267686,
"learning_rate": 0.00019335359546146156,
"loss": 1.2028,
"step": 385
},
{
"epoch": 0.20780604768882377,
"grad_norm": 0.08301528792067146,
"learning_rate": 0.00019301596529526854,
"loss": 1.2056,
"step": 390
},
{
"epoch": 0.21047022778739843,
"grad_norm": 0.08890281842327881,
"learning_rate": 0.00019267028043161094,
"loss": 1.2138,
"step": 395
},
{
"epoch": 0.2131344078859731,
"grad_norm": 0.0900002819215321,
"learning_rate": 0.0001923165708049951,
"loss": 1.2051,
"step": 400
},
{
"epoch": 0.21579858798454776,
"grad_norm": 0.10504748978346506,
"learning_rate": 0.00019195486704482977,
"loss": 1.1954,
"step": 405
},
{
"epoch": 0.21846276808312243,
"grad_norm": 0.09718704604170597,
"learning_rate": 0.0001915852004727742,
"loss": 1.1639,
"step": 410
},
{
"epoch": 0.2211269481816971,
"grad_norm": 0.08996864675224003,
"learning_rate": 0.00019120760310002545,
"loss": 1.2265,
"step": 415
},
{
"epoch": 0.22379112828027176,
"grad_norm": 0.07943107198536507,
"learning_rate": 0.0001908221076245466,
"loss": 1.2169,
"step": 420
},
{
"epoch": 0.22645530837884642,
"grad_norm": 0.08284744366625013,
"learning_rate": 0.0001904287474282353,
"loss": 1.1828,
"step": 425
},
{
"epoch": 0.22911948847742108,
"grad_norm": 0.07736916644319017,
"learning_rate": 0.00019002755657403298,
"loss": 1.1979,
"step": 430
},
{
"epoch": 0.23178366857599575,
"grad_norm": 0.08280858698864835,
"learning_rate": 0.00018961856980297513,
"loss": 1.191,
"step": 435
},
{
"epoch": 0.2344478486745704,
"grad_norm": 0.07937399115370877,
"learning_rate": 0.0001892018225311831,
"loss": 1.2173,
"step": 440
},
{
"epoch": 0.23711202877314508,
"grad_norm": 0.08409239653616714,
"learning_rate": 0.00018877735084679693,
"loss": 1.1903,
"step": 445
},
{
"epoch": 0.23977620887171971,
"grad_norm": 0.0825557060792781,
"learning_rate": 0.00018834519150685071,
"loss": 1.1985,
"step": 450
},
{
"epoch": 0.24244038897029438,
"grad_norm": 0.07982212340626871,
"learning_rate": 0.00018790538193408937,
"loss": 1.2038,
"step": 455
},
{
"epoch": 0.24510456906886904,
"grad_norm": 0.0774659417434534,
"learning_rate": 0.000187457960213728,
"loss": 1.1788,
"step": 460
},
{
"epoch": 0.2477687491674437,
"grad_norm": 0.08395126176587492,
"learning_rate": 0.00018700296509015406,
"loss": 1.1862,
"step": 465
},
{
"epoch": 0.2504329292660184,
"grad_norm": 0.09287329697747186,
"learning_rate": 0.00018654043596357217,
"loss": 1.2092,
"step": 470
},
{
"epoch": 0.25309710936459306,
"grad_norm": 0.09110234430419713,
"learning_rate": 0.00018607041288659236,
"loss": 1.1974,
"step": 475
},
{
"epoch": 0.25576128946316773,
"grad_norm": 0.08308651742822264,
"learning_rate": 0.00018559293656076166,
"loss": 1.1869,
"step": 480
},
{
"epoch": 0.2584254695617424,
"grad_norm": 0.08214254142545631,
"learning_rate": 0.0001851080483330396,
"loss": 1.1831,
"step": 485
},
{
"epoch": 0.26108964966031706,
"grad_norm": 0.08701064777773787,
"learning_rate": 0.00018461579019221774,
"loss": 1.1879,
"step": 490
},
{
"epoch": 0.2637538297588917,
"grad_norm": 0.08680397859688439,
"learning_rate": 0.00018411620476528362,
"loss": 1.1929,
"step": 495
},
{
"epoch": 0.2664180098574664,
"grad_norm": 0.09030897168085561,
"learning_rate": 0.0001836093353137297,
"loss": 1.1902,
"step": 500
},
{
"epoch": 0.26908218995604105,
"grad_norm": 0.08473182672968443,
"learning_rate": 0.00018309522572980673,
"loss": 1.2044,
"step": 505
},
{
"epoch": 0.2717463700546157,
"grad_norm": 0.08536458790900281,
"learning_rate": 0.00018257392053272345,
"loss": 1.2259,
"step": 510
},
{
"epoch": 0.2744105501531904,
"grad_norm": 0.081003787470878,
"learning_rate": 0.00018204546486479096,
"loss": 1.213,
"step": 515
},
{
"epoch": 0.27707473025176504,
"grad_norm": 0.089272857126091,
"learning_rate": 0.00018150990448751394,
"loss": 1.1791,
"step": 520
},
{
"epoch": 0.2797389103503397,
"grad_norm": 0.08621840551694437,
"learning_rate": 0.0001809672857776278,
"loss": 1.2024,
"step": 525
},
{
"epoch": 0.2824030904489144,
"grad_norm": 0.07933042265020962,
"learning_rate": 0.00018041765572308278,
"loss": 1.2025,
"step": 530
},
{
"epoch": 0.28506727054748904,
"grad_norm": 0.0850752493351441,
"learning_rate": 0.00017986106191897493,
"loss": 1.1994,
"step": 535
},
{
"epoch": 0.28773145064606365,
"grad_norm": 0.07823580475459424,
"learning_rate": 0.00017929755256342479,
"loss": 1.2139,
"step": 540
},
{
"epoch": 0.2903956307446383,
"grad_norm": 0.07706877273113974,
"learning_rate": 0.0001787271764534035,
"loss": 1.1909,
"step": 545
},
{
"epoch": 0.293059810843213,
"grad_norm": 0.07946013242507238,
"learning_rate": 0.00017814998298050743,
"loss": 1.1795,
"step": 550
},
{
"epoch": 0.29572399094178764,
"grad_norm": 0.08180878258450536,
"learning_rate": 0.00017756602212668082,
"loss": 1.1906,
"step": 555
},
{
"epoch": 0.2983881710403623,
"grad_norm": 0.09250927603458256,
"learning_rate": 0.00017697534445988803,
"loss": 1.1779,
"step": 560
},
{
"epoch": 0.30105235113893697,
"grad_norm": 0.09177332060381718,
"learning_rate": 0.00017637800112973428,
"loss": 1.1723,
"step": 565
},
{
"epoch": 0.30371653123751163,
"grad_norm": 0.0837886550110598,
"learning_rate": 0.00017577404386303645,
"loss": 1.1954,
"step": 570
},
{
"epoch": 0.3063807113360863,
"grad_norm": 0.09291852958714042,
"learning_rate": 0.0001751635249593439,
"loss": 1.1913,
"step": 575
},
{
"epoch": 0.30904489143466096,
"grad_norm": 0.0751186613096299,
"learning_rate": 0.00017454649728640943,
"loss": 1.1884,
"step": 580
},
{
"epoch": 0.3117090715332356,
"grad_norm": 0.08335342725723932,
"learning_rate": 0.00017392301427561146,
"loss": 1.2182,
"step": 585
},
{
"epoch": 0.3143732516318103,
"grad_norm": 0.10512490135127568,
"learning_rate": 0.00017329312991732688,
"loss": 1.2022,
"step": 590
},
{
"epoch": 0.31703743173038496,
"grad_norm": 0.08736469147276345,
"learning_rate": 0.00017265689875625587,
"loss": 1.2034,
"step": 595
},
{
"epoch": 0.3197016118289596,
"grad_norm": 0.08181172036849664,
"learning_rate": 0.00017201437588669878,
"loss": 1.1734,
"step": 600
},
{
"epoch": 0.3223657919275343,
"grad_norm": 0.10161889430894355,
"learning_rate": 0.0001713656169477849,
"loss": 1.1819,
"step": 605
},
{
"epoch": 0.32502997202610895,
"grad_norm": 0.08576387885641809,
"learning_rate": 0.00017071067811865476,
"loss": 1.2189,
"step": 610
},
{
"epoch": 0.3276941521246836,
"grad_norm": 0.07896254402827822,
"learning_rate": 0.00017004961611359506,
"loss": 1.1975,
"step": 615
},
{
"epoch": 0.3303583322232583,
"grad_norm": 0.0843725771070502,
"learning_rate": 0.00016938248817712767,
"loss": 1.2049,
"step": 620
},
{
"epoch": 0.33302251232183294,
"grad_norm": 0.08922430755828152,
"learning_rate": 0.0001687093520790524,
"loss": 1.2,
"step": 625
},
{
"epoch": 0.3356866924204076,
"grad_norm": 0.09060345427129264,
"learning_rate": 0.00016803026610944462,
"loss": 1.2019,
"step": 630
},
{
"epoch": 0.33835087251898227,
"grad_norm": 0.08487192271777715,
"learning_rate": 0.0001673452890736074,
"loss": 1.2101,
"step": 635
},
{
"epoch": 0.34101505261755694,
"grad_norm": 0.10237244952453312,
"learning_rate": 0.00016665448028697961,
"loss": 1.1917,
"step": 640
},
{
"epoch": 0.3436792327161316,
"grad_norm": 0.08297722193267411,
"learning_rate": 0.0001659578995699991,
"loss": 1.1714,
"step": 645
},
{
"epoch": 0.34634341281470626,
"grad_norm": 0.08034790490692274,
"learning_rate": 0.00016525560724292305,
"loss": 1.2135,
"step": 650
},
{
"epoch": 0.34900759291328093,
"grad_norm": 0.07638178286632234,
"learning_rate": 0.00016454766412060402,
"loss": 1.1725,
"step": 655
},
{
"epoch": 0.3516717730118556,
"grad_norm": 0.08455753089941971,
"learning_rate": 0.00016383413150722415,
"loss": 1.1842,
"step": 660
},
{
"epoch": 0.35433595311043026,
"grad_norm": 0.07805666953810234,
"learning_rate": 0.00016311507119098627,
"loss": 1.2005,
"step": 665
},
{
"epoch": 0.3570001332090049,
"grad_norm": 0.07613396423899182,
"learning_rate": 0.00016239054543876343,
"loss": 1.1971,
"step": 670
},
{
"epoch": 0.3596643133075796,
"grad_norm": 0.07964041743179047,
"learning_rate": 0.00016166061699070703,
"loss": 1.2247,
"step": 675
},
{
"epoch": 0.36232849340615425,
"grad_norm": 0.07545427492471256,
"learning_rate": 0.00016092534905481367,
"loss": 1.1965,
"step": 680
},
{
"epoch": 0.3649926735047289,
"grad_norm": 0.07935466685302715,
"learning_rate": 0.00016018480530145175,
"loss": 1.1777,
"step": 685
},
{
"epoch": 0.3676568536033036,
"grad_norm": 0.07697591776205723,
"learning_rate": 0.00015943904985784796,
"loss": 1.2043,
"step": 690
},
{
"epoch": 0.37032103370187824,
"grad_norm": 0.08353307335035773,
"learning_rate": 0.00015868814730253422,
"loss": 1.2016,
"step": 695
},
{
"epoch": 0.3729852138004529,
"grad_norm": 0.12083203321167697,
"learning_rate": 0.00015793216265975538,
"loss": 1.2039,
"step": 700
},
{
"epoch": 0.3756493938990276,
"grad_norm": 0.08002384831591002,
"learning_rate": 0.0001571711613938387,
"loss": 1.2018,
"step": 705
},
{
"epoch": 0.37831357399760224,
"grad_norm": 0.07416696808155057,
"learning_rate": 0.00015640520940352474,
"loss": 1.1761,
"step": 710
},
{
"epoch": 0.3809777540961769,
"grad_norm": 0.07477584114343706,
"learning_rate": 0.00015563437301626095,
"loss": 1.1691,
"step": 715
},
{
"epoch": 0.38364193419475157,
"grad_norm": 0.08021029434297425,
"learning_rate": 0.00015485871898245822,
"loss": 1.1879,
"step": 720
},
{
"epoch": 0.38630611429332623,
"grad_norm": 0.07655243996754035,
"learning_rate": 0.0001540783144697103,
"loss": 1.1895,
"step": 725
},
{
"epoch": 0.3889702943919009,
"grad_norm": 0.07480027547778711,
"learning_rate": 0.0001532932270569778,
"loss": 1.1975,
"step": 730
},
{
"epoch": 0.39163447449047556,
"grad_norm": 0.0777051603884027,
"learning_rate": 0.00015250352472873603,
"loss": 1.1676,
"step": 735
},
{
"epoch": 0.3942986545890502,
"grad_norm": 0.09274122893001194,
"learning_rate": 0.00015170927586908786,
"loss": 1.185,
"step": 740
},
{
"epoch": 0.3969628346876249,
"grad_norm": 0.08369763065847857,
"learning_rate": 0.00015091054925584204,
"loss": 1.1839,
"step": 745
},
{
"epoch": 0.39962701478619955,
"grad_norm": 0.0785954093676615,
"learning_rate": 0.0001501074140545575,
"loss": 1.195,
"step": 750
},
{
"epoch": 0.4022911948847742,
"grad_norm": 0.08125955697905107,
"learning_rate": 0.00014929993981255388,
"loss": 1.167,
"step": 755
},
{
"epoch": 0.4049553749833489,
"grad_norm": 0.0795318945758808,
"learning_rate": 0.00014848819645288915,
"loss": 1.1809,
"step": 760
},
{
"epoch": 0.40761955508192355,
"grad_norm": 0.08215872783779934,
"learning_rate": 0.0001476722542683045,
"loss": 1.1982,
"step": 765
},
{
"epoch": 0.4102837351804982,
"grad_norm": 0.0902569586423544,
"learning_rate": 0.0001468521839151375,
"loss": 1.2058,
"step": 770
},
{
"epoch": 0.4129479152790729,
"grad_norm": 0.08977396388549971,
"learning_rate": 0.00014602805640720373,
"loss": 1.2046,
"step": 775
},
{
"epoch": 0.41561209537764754,
"grad_norm": 0.08797468277446341,
"learning_rate": 0.00014519994310964698,
"loss": 1.1767,
"step": 780
},
{
"epoch": 0.4182762754762222,
"grad_norm": 0.08154618218534682,
"learning_rate": 0.0001443679157327598,
"loss": 1.2016,
"step": 785
},
{
"epoch": 0.42094045557479687,
"grad_norm": 0.08123872265319151,
"learning_rate": 0.00014353204632577352,
"loss": 1.1836,
"step": 790
},
{
"epoch": 0.42360463567337153,
"grad_norm": 0.09034057444914761,
"learning_rate": 0.00014269240727061928,
"loss": 1.1984,
"step": 795
},
{
"epoch": 0.4262688157719462,
"grad_norm": 0.07751594264193425,
"learning_rate": 0.00014184907127566006,
"loss": 1.1721,
"step": 800
},
{
"epoch": 0.42893299587052086,
"grad_norm": 0.07817067745827988,
"learning_rate": 0.00014100211136939457,
"loss": 1.2066,
"step": 805
},
{
"epoch": 0.4315971759690955,
"grad_norm": 0.07666379004066795,
"learning_rate": 0.00014015160089413331,
"loss": 1.201,
"step": 810
},
{
"epoch": 0.4342613560676702,
"grad_norm": 0.07977661703965382,
"learning_rate": 0.00013929761349964755,
"loss": 1.1986,
"step": 815
},
{
"epoch": 0.43692553616624485,
"grad_norm": 0.07515238252186936,
"learning_rate": 0.00013844022313679166,
"loss": 1.1673,
"step": 820
},
{
"epoch": 0.4395897162648195,
"grad_norm": 0.08668062284259438,
"learning_rate": 0.00013757950405109926,
"loss": 1.2055,
"step": 825
},
{
"epoch": 0.4422538963633942,
"grad_norm": 0.07958110157591258,
"learning_rate": 0.00013671553077635403,
"loss": 1.2052,
"step": 830
},
{
"epoch": 0.44491807646196885,
"grad_norm": 0.07801662094774196,
"learning_rate": 0.00013584837812813554,
"loss": 1.191,
"step": 835
},
{
"epoch": 0.4475822565605435,
"grad_norm": 0.0766973195738648,
"learning_rate": 0.00013497812119734037,
"loss": 1.1918,
"step": 840
},
{
"epoch": 0.4502464366591182,
"grad_norm": 0.07475449320976814,
"learning_rate": 0.00013410483534367988,
"loss": 1.1837,
"step": 845
},
{
"epoch": 0.45291061675769284,
"grad_norm": 0.07991050463902843,
"learning_rate": 0.0001332285961891543,
"loss": 1.1941,
"step": 850
},
{
"epoch": 0.4555747968562675,
"grad_norm": 0.08254589522833469,
"learning_rate": 0.00013234947961150438,
"loss": 1.1782,
"step": 855
},
{
"epoch": 0.45823897695484217,
"grad_norm": 0.07717793943049406,
"learning_rate": 0.0001314675617376406,
"loss": 1.1773,
"step": 860
},
{
"epoch": 0.46090315705341683,
"grad_norm": 0.07981760014460237,
"learning_rate": 0.00013058291893705123,
"loss": 1.1587,
"step": 865
},
{
"epoch": 0.4635673371519915,
"grad_norm": 0.0746596071448608,
"learning_rate": 0.00012969562781518884,
"loss": 1.1672,
"step": 870
},
{
"epoch": 0.46623151725056616,
"grad_norm": 0.07546467676535663,
"learning_rate": 0.00012880576520683687,
"loss": 1.188,
"step": 875
},
{
"epoch": 0.4688956973491408,
"grad_norm": 0.0822763180376801,
"learning_rate": 0.00012791340816945609,
"loss": 1.1773,
"step": 880
},
{
"epoch": 0.4715598774477155,
"grad_norm": 0.08094564516346787,
"learning_rate": 0.00012701863397651176,
"loss": 1.1692,
"step": 885
},
{
"epoch": 0.47422405754629016,
"grad_norm": 0.08080906995900732,
"learning_rate": 0.00012612152011078233,
"loss": 1.1923,
"step": 890
},
{
"epoch": 0.47688823764486477,
"grad_norm": 0.07799247485478207,
"learning_rate": 0.00012522214425764953,
"loss": 1.1762,
"step": 895
},
{
"epoch": 0.47955241774343943,
"grad_norm": 0.07890128228587008,
"learning_rate": 0.00012432058429837152,
"loss": 1.1872,
"step": 900
},
{
"epoch": 0.4822165978420141,
"grad_norm": 0.07930485805581218,
"learning_rate": 0.00012341691830333867,
"loss": 1.1801,
"step": 905
},
{
"epoch": 0.48488077794058876,
"grad_norm": 0.07523574346910737,
"learning_rate": 0.000122511224525313,
"loss": 1.1705,
"step": 910
},
{
"epoch": 0.4875449580391634,
"grad_norm": 0.08019564137195122,
"learning_rate": 0.00012160358139265202,
"loss": 1.1968,
"step": 915
},
{
"epoch": 0.4902091381377381,
"grad_norm": 0.07799871453882441,
"learning_rate": 0.00012069406750251713,
"loss": 1.2037,
"step": 920
},
{
"epoch": 0.49287331823631275,
"grad_norm": 0.07616406333142303,
"learning_rate": 0.00011978276161406756,
"loss": 1.1771,
"step": 925
},
{
"epoch": 0.4955374983348874,
"grad_norm": 0.08287250388641591,
"learning_rate": 0.00011886974264164037,
"loss": 1.1817,
"step": 930
},
{
"epoch": 0.4982016784334621,
"grad_norm": 0.07709252333335805,
"learning_rate": 0.00011795508964791659,
"loss": 1.1837,
"step": 935
},
{
"epoch": 0.5008658585320368,
"grad_norm": 0.07774534231906956,
"learning_rate": 0.00011703888183707512,
"loss": 1.1801,
"step": 940
},
{
"epoch": 0.5035300386306114,
"grad_norm": 0.07836641922774835,
"learning_rate": 0.00011612119854793377,
"loss": 1.1928,
"step": 945
},
{
"epoch": 0.5061942187291861,
"grad_norm": 0.0855819667886154,
"learning_rate": 0.00011520211924707917,
"loss": 1.2115,
"step": 950
},
{
"epoch": 0.5088583988277607,
"grad_norm": 0.08249753825043267,
"learning_rate": 0.00011428172352198534,
"loss": 1.1902,
"step": 955
},
{
"epoch": 0.5115225789263355,
"grad_norm": 0.07960165957450928,
"learning_rate": 0.00011336009107412162,
"loss": 1.1846,
"step": 960
},
{
"epoch": 0.5141867590249101,
"grad_norm": 0.0844585882351588,
"learning_rate": 0.00011243730171205118,
"loss": 1.1546,
"step": 965
},
{
"epoch": 0.5168509391234848,
"grad_norm": 0.07615164060165457,
"learning_rate": 0.00011151343534451994,
"loss": 1.1909,
"step": 970
},
{
"epoch": 0.5195151192220594,
"grad_norm": 0.08628193709778877,
"learning_rate": 0.00011058857197353683,
"loss": 1.1832,
"step": 975
},
{
"epoch": 0.5221792993206341,
"grad_norm": 0.08464663568633256,
"learning_rate": 0.0001096627916874461,
"loss": 1.19,
"step": 980
},
{
"epoch": 0.5248434794192087,
"grad_norm": 0.07335066380801168,
"learning_rate": 0.00010873617465399209,
"loss": 1.1962,
"step": 985
},
{
"epoch": 0.5275076595177834,
"grad_norm": 0.07968663002138815,
"learning_rate": 0.00010780880111337703,
"loss": 1.1882,
"step": 990
},
{
"epoch": 0.530171839616358,
"grad_norm": 0.08264162145143913,
"learning_rate": 0.00010688075137131282,
"loss": 1.1731,
"step": 995
},
{
"epoch": 0.5328360197149328,
"grad_norm": 0.0801021939451612,
"learning_rate": 0.00010595210579206676,
"loss": 1.1947,
"step": 1000
},
{
"epoch": 0.5355001998135074,
"grad_norm": 0.08017946990238331,
"learning_rate": 0.0001050229447915027,
"loss": 1.2001,
"step": 1005
},
{
"epoch": 0.5381643799120821,
"grad_norm": 0.07658836800590599,
"learning_rate": 0.0001040933488301171,
"loss": 1.2002,
"step": 1010
},
{
"epoch": 0.5408285600106567,
"grad_norm": 0.07982866205360158,
"learning_rate": 0.00010316339840607194,
"loss": 1.1836,
"step": 1015
},
{
"epoch": 0.5434927401092314,
"grad_norm": 0.07960723865086924,
"learning_rate": 0.0001022331740482237,
"loss": 1.1777,
"step": 1020
},
{
"epoch": 0.546156920207806,
"grad_norm": 0.0748679705543507,
"learning_rate": 0.00010130275630915009,
"loss": 1.1921,
"step": 1025
},
{
"epoch": 0.5488211003063808,
"grad_norm": 0.0878356815626402,
"learning_rate": 0.00010037222575817475,
"loss": 1.1709,
"step": 1030
},
{
"epoch": 0.5514852804049554,
"grad_norm": 0.07848930190109027,
"learning_rate": 9.944166297439011e-05,
"loss": 1.1896,
"step": 1035
},
{
"epoch": 0.5541494605035301,
"grad_norm": 0.07598028143410929,
"learning_rate": 9.85111485396798e-05,
"loss": 1.1671,
"step": 1040
},
{
"epoch": 0.5568136406021047,
"grad_norm": 0.07455678199305199,
"learning_rate": 9.758076303174082e-05,
"loss": 1.1879,
"step": 1045
},
{
"epoch": 0.5594778207006794,
"grad_norm": 0.07933883340689663,
"learning_rate": 9.665058701710561e-05,
"loss": 1.1906,
"step": 1050
},
{
"epoch": 0.562142000799254,
"grad_norm": 0.0782311193459855,
"learning_rate": 9.572070104416566e-05,
"loss": 1.1814,
"step": 1055
},
{
"epoch": 0.5648061808978287,
"grad_norm": 0.07728561745740628,
"learning_rate": 9.479118563619636e-05,
"loss": 1.179,
"step": 1060
},
{
"epoch": 0.5674703609964034,
"grad_norm": 0.08077639691991045,
"learning_rate": 9.386212128438412e-05,
"loss": 1.1957,
"step": 1065
},
{
"epoch": 0.5701345410949781,
"grad_norm": 0.07773225366778684,
"learning_rate": 9.29335884408562e-05,
"loss": 1.221,
"step": 1070
},
{
"epoch": 0.5727987211935527,
"grad_norm": 0.07773099884244754,
"learning_rate": 9.2005667511714e-05,
"loss": 1.158,
"step": 1075
},
{
"epoch": 0.5754629012921273,
"grad_norm": 0.07725515386954404,
"learning_rate": 9.107843885007042e-05,
"loss": 1.1699,
"step": 1080
},
{
"epoch": 0.578127081390702,
"grad_norm": 0.08162033290693702,
"learning_rate": 9.015198274909151e-05,
"loss": 1.1885,
"step": 1085
},
{
"epoch": 0.5807912614892766,
"grad_norm": 0.07774378164612243,
"learning_rate": 8.922637943504361e-05,
"loss": 1.1924,
"step": 1090
},
{
"epoch": 0.5834554415878513,
"grad_norm": 0.08627267331483346,
"learning_rate": 8.830170906034625e-05,
"loss": 1.1971,
"step": 1095
},
{
"epoch": 0.586119621686426,
"grad_norm": 0.07645366300993464,
"learning_rate": 8.737805169663114e-05,
"loss": 1.1807,
"step": 1100
},
{
"epoch": 0.5887838017850007,
"grad_norm": 0.08443619410444902,
"learning_rate": 8.645548732780864e-05,
"loss": 1.1761,
"step": 1105
},
{
"epoch": 0.5914479818835753,
"grad_norm": 0.07706639361213953,
"learning_rate": 8.553409584314138e-05,
"loss": 1.1902,
"step": 1110
},
{
"epoch": 0.59411216198215,
"grad_norm": 0.08249092376191036,
"learning_rate": 8.461395703032638e-05,
"loss": 1.1839,
"step": 1115
},
{
"epoch": 0.5967763420807246,
"grad_norm": 0.08065147422026245,
"learning_rate": 8.369515056858575e-05,
"loss": 1.1731,
"step": 1120
},
{
"epoch": 0.5994405221792993,
"grad_norm": 0.07848658439918688,
"learning_rate": 8.277775602176702e-05,
"loss": 1.177,
"step": 1125
},
{
"epoch": 0.6021047022778739,
"grad_norm": 0.07816482146783796,
"learning_rate": 8.186185283145325e-05,
"loss": 1.1625,
"step": 1130
},
{
"epoch": 0.6047688823764487,
"grad_norm": 0.07727396916592474,
"learning_rate": 8.094752031008371e-05,
"loss": 1.2127,
"step": 1135
},
{
"epoch": 0.6074330624750233,
"grad_norm": 0.0789877501243841,
"learning_rate": 8.003483763408603e-05,
"loss": 1.1685,
"step": 1140
},
{
"epoch": 0.610097242573598,
"grad_norm": 0.07854117414343613,
"learning_rate": 7.912388383701982e-05,
"loss": 1.1826,
"step": 1145
},
{
"epoch": 0.6127614226721726,
"grad_norm": 0.07974248826415456,
"learning_rate": 7.821473780273279e-05,
"loss": 1.1867,
"step": 1150
},
{
"epoch": 0.6154256027707473,
"grad_norm": 0.08234486503543673,
"learning_rate": 7.730747825852975e-05,
"loss": 1.1928,
"step": 1155
},
{
"epoch": 0.6180897828693219,
"grad_norm": 0.08113984284337296,
"learning_rate": 7.64021837683554e-05,
"loss": 1.2018,
"step": 1160
},
{
"epoch": 0.6207539629678966,
"grad_norm": 0.07823635237833673,
"learning_rate": 7.549893272599098e-05,
"loss": 1.1756,
"step": 1165
},
{
"epoch": 0.6234181430664713,
"grad_norm": 0.07948360741609674,
"learning_rate": 7.459780334826578e-05,
"loss": 1.2052,
"step": 1170
},
{
"epoch": 0.626082323165046,
"grad_norm": 0.07981232728150925,
"learning_rate": 7.369887366828405e-05,
"loss": 1.1935,
"step": 1175
},
{
"epoch": 0.6287465032636206,
"grad_norm": 0.07772028594630517,
"learning_rate": 7.28022215286676e-05,
"loss": 1.1742,
"step": 1180
},
{
"epoch": 0.6314106833621953,
"grad_norm": 0.07942822498880386,
"learning_rate": 7.190792457481526e-05,
"loss": 1.2044,
"step": 1185
},
{
"epoch": 0.6340748634607699,
"grad_norm": 0.08072671416547043,
"learning_rate": 7.101606024817888e-05,
"loss": 1.2139,
"step": 1190
},
{
"epoch": 0.6367390435593446,
"grad_norm": 0.07657851251411404,
"learning_rate": 7.01267057795577e-05,
"loss": 1.1771,
"step": 1195
},
{
"epoch": 0.6394032236579192,
"grad_norm": 0.07629161839506614,
"learning_rate": 6.923993818241013e-05,
"loss": 1.1878,
"step": 1200
},
{
"epoch": 0.642067403756494,
"grad_norm": 0.07750592384625017,
"learning_rate": 6.83558342461851e-05,
"loss": 1.1965,
"step": 1205
},
{
"epoch": 0.6447315838550686,
"grad_norm": 0.08074061148056243,
"learning_rate": 6.747447052967246e-05,
"loss": 1.1598,
"step": 1210
},
{
"epoch": 0.6473957639536433,
"grad_norm": 0.08114992875515604,
"learning_rate": 6.659592335437321e-05,
"loss": 1.1863,
"step": 1215
},
{
"epoch": 0.6500599440522179,
"grad_norm": 0.07837122688644742,
"learning_rate": 6.572026879789064e-05,
"loss": 1.1789,
"step": 1220
},
{
"epoch": 0.6527241241507926,
"grad_norm": 0.07904963655043487,
"learning_rate": 6.484758268734226e-05,
"loss": 1.1988,
"step": 1225
},
{
"epoch": 0.6553883042493672,
"grad_norm": 0.07794516218687547,
"learning_rate": 6.397794059279376e-05,
"loss": 1.1797,
"step": 1230
},
{
"epoch": 0.658052484347942,
"grad_norm": 0.0782549564009468,
"learning_rate": 6.311141782071486e-05,
"loss": 1.1861,
"step": 1235
},
{
"epoch": 0.6607166644465166,
"grad_norm": 0.08113466467014144,
"learning_rate": 6.224808940745814e-05,
"loss": 1.1812,
"step": 1240
},
{
"epoch": 0.6633808445450913,
"grad_norm": 0.0751998700186739,
"learning_rate": 6.138803011276157e-05,
"loss": 1.1903,
"step": 1245
},
{
"epoch": 0.6660450246436659,
"grad_norm": 0.08263899166467202,
"learning_rate": 6.0531314413274306e-05,
"loss": 1.1652,
"step": 1250
},
{
"epoch": 0.6687092047422406,
"grad_norm": 0.07705824954536489,
"learning_rate": 5.9678016496107737e-05,
"loss": 1.1811,
"step": 1255
},
{
"epoch": 0.6713733848408152,
"grad_norm": 0.08540183694555796,
"learning_rate": 5.8828210252410995e-05,
"loss": 1.1896,
"step": 1260
},
{
"epoch": 0.6740375649393899,
"grad_norm": 0.0742001043082849,
"learning_rate": 5.798196927097259e-05,
"loss": 1.1709,
"step": 1265
},
{
"epoch": 0.6767017450379645,
"grad_norm": 0.2483647268345394,
"learning_rate": 5.7139366831847955e-05,
"loss": 1.1841,
"step": 1270
},
{
"epoch": 0.6793659251365393,
"grad_norm": 0.07966799417452507,
"learning_rate": 5.63004759000136e-05,
"loss": 1.1739,
"step": 1275
},
{
"epoch": 0.6820301052351139,
"grad_norm": 0.07969094718480424,
"learning_rate": 5.546536911904896e-05,
"loss": 1.1903,
"step": 1280
},
{
"epoch": 0.6846942853336886,
"grad_norm": 0.0828241197230709,
"learning_rate": 5.463411880484577e-05,
"loss": 1.1802,
"step": 1285
},
{
"epoch": 0.6873584654322632,
"grad_norm": 0.09372413723978523,
"learning_rate": 5.3806796939345685e-05,
"loss": 1.1786,
"step": 1290
},
{
"epoch": 0.6900226455308379,
"grad_norm": 0.07631033783827931,
"learning_rate": 5.298347516430748e-05,
"loss": 1.1895,
"step": 1295
},
{
"epoch": 0.6926868256294125,
"grad_norm": 0.07690415214661994,
"learning_rate": 5.216422477510267e-05,
"loss": 1.1913,
"step": 1300
},
{
"epoch": 0.6953510057279872,
"grad_norm": 0.0767097655138434,
"learning_rate": 5.1349116714542144e-05,
"loss": 1.1685,
"step": 1305
},
{
"epoch": 0.6980151858265619,
"grad_norm": 0.07609380641851764,
"learning_rate": 5.053822156673276e-05,
"loss": 1.1907,
"step": 1310
},
{
"epoch": 0.7006793659251366,
"grad_norm": 0.07956518870952646,
"learning_rate": 4.973160955096496e-05,
"loss": 1.1668,
"step": 1315
},
{
"epoch": 0.7033435460237112,
"grad_norm": 0.0781847554601962,
"learning_rate": 4.892935051563242e-05,
"loss": 1.1898,
"step": 1320
},
{
"epoch": 0.7060077261222859,
"grad_norm": 0.07968044970093434,
"learning_rate": 4.8131513932183415e-05,
"loss": 1.2072,
"step": 1325
},
{
"epoch": 0.7086719062208605,
"grad_norm": 0.07875194128774458,
"learning_rate": 4.733816888910483e-05,
"loss": 1.178,
"step": 1330
},
{
"epoch": 0.7113360863194352,
"grad_norm": 0.08181988531463072,
"learning_rate": 4.654938408593974e-05,
"loss": 1.1679,
"step": 1335
},
{
"epoch": 0.7140002664180098,
"grad_norm": 0.08458887612797164,
"learning_rate": 4.576522782733802e-05,
"loss": 1.1925,
"step": 1340
},
{
"epoch": 0.7166644465165846,
"grad_norm": 0.07929665848737272,
"learning_rate": 4.4985768017142014e-05,
"loss": 1.1942,
"step": 1345
},
{
"epoch": 0.7193286266151592,
"grad_norm": 0.0793190517544045,
"learning_rate": 4.421107215250586e-05,
"loss": 1.1504,
"step": 1350
},
{
"epoch": 0.7219928067137339,
"grad_norm": 0.0793615584488964,
"learning_rate": 4.3441207318051005e-05,
"loss": 1.1704,
"step": 1355
},
{
"epoch": 0.7246569868123085,
"grad_norm": 0.08055131022376696,
"learning_rate": 4.2676240180056856e-05,
"loss": 1.1937,
"step": 1360
},
{
"epoch": 0.7273211669108832,
"grad_norm": 0.08173806907452158,
"learning_rate": 4.191623698068778e-05,
"loss": 1.1779,
"step": 1365
},
{
"epoch": 0.7299853470094578,
"grad_norm": 0.07773805545181321,
"learning_rate": 4.116126353225703e-05,
"loss": 1.1846,
"step": 1370
},
{
"epoch": 0.7326495271080325,
"grad_norm": 0.07667327254766519,
"learning_rate": 4.0411385211527684e-05,
"loss": 1.2095,
"step": 1375
},
{
"epoch": 0.7353137072066072,
"grad_norm": 0.08098668895333083,
"learning_rate": 3.96666669540512e-05,
"loss": 1.1682,
"step": 1380
},
{
"epoch": 0.7379778873051819,
"grad_norm": 0.07984137500350058,
"learning_rate": 3.892717324854459e-05,
"loss": 1.1729,
"step": 1385
},
{
"epoch": 0.7406420674037565,
"grad_norm": 0.07922048060290626,
"learning_rate": 3.8192968131305886e-05,
"loss": 1.1775,
"step": 1390
},
{
"epoch": 0.7433062475023312,
"grad_norm": 0.07652665688687964,
"learning_rate": 3.746411518066894e-05,
"loss": 1.1621,
"step": 1395
},
{
"epoch": 0.7459704276009058,
"grad_norm": 0.0798026979694231,
"learning_rate": 3.674067751149796e-05,
"loss": 1.1702,
"step": 1400
},
{
"epoch": 0.7486346076994805,
"grad_norm": 0.08300115098412487,
"learning_rate": 3.602271776972188e-05,
"loss": 1.1533,
"step": 1405
},
{
"epoch": 0.7512987877980551,
"grad_norm": 0.08575297897696614,
"learning_rate": 3.5310298126909816e-05,
"loss": 1.2051,
"step": 1410
},
{
"epoch": 0.7539629678966299,
"grad_norm": 0.07861668329891834,
"learning_rate": 3.46034802748872e-05,
"loss": 1.1804,
"step": 1415
},
{
"epoch": 0.7566271479952045,
"grad_norm": 0.07590825262005231,
"learning_rate": 3.390232542039352e-05,
"loss": 1.1846,
"step": 1420
},
{
"epoch": 0.7592913280937792,
"grad_norm": 0.07760055527146281,
"learning_rate": 3.320689427978232e-05,
"loss": 1.174,
"step": 1425
},
{
"epoch": 0.7619555081923538,
"grad_norm": 0.07989565355982597,
"learning_rate": 3.251724707376324e-05,
"loss": 1.1696,
"step": 1430
},
{
"epoch": 0.7646196882909284,
"grad_norm": 0.07764798745610466,
"learning_rate": 3.1833443522187454e-05,
"loss": 1.1761,
"step": 1435
},
{
"epoch": 0.7672838683895031,
"grad_norm": 0.07976913879065081,
"learning_rate": 3.115554283887614e-05,
"loss": 1.1909,
"step": 1440
},
{
"epoch": 0.7699480484880777,
"grad_norm": 0.08144781158937257,
"learning_rate": 3.0483603726492836e-05,
"loss": 1.1718,
"step": 1445
},
{
"epoch": 0.7726122285866525,
"grad_norm": 0.07748040591215276,
"learning_rate": 2.9817684371460153e-05,
"loss": 1.1867,
"step": 1450
},
{
"epoch": 0.7752764086852271,
"grad_norm": 0.07690798090395808,
"learning_rate": 2.9157842438921047e-05,
"loss": 1.201,
"step": 1455
},
{
"epoch": 0.7779405887838018,
"grad_norm": 0.07507016568699426,
"learning_rate": 2.8504135067745464e-05,
"loss": 1.1881,
"step": 1460
},
{
"epoch": 0.7806047688823764,
"grad_norm": 0.07900678246787794,
"learning_rate": 2.7856618865582318e-05,
"loss": 1.1734,
"step": 1465
},
{
"epoch": 0.7832689489809511,
"grad_norm": 0.08067040264461905,
"learning_rate": 2.721534990395752e-05,
"loss": 1.2003,
"step": 1470
},
{
"epoch": 0.7859331290795257,
"grad_norm": 0.08005195988665038,
"learning_rate": 2.658038371341859e-05,
"loss": 1.1898,
"step": 1475
},
{
"epoch": 0.7885973091781004,
"grad_norm": 0.07899201399633156,
"learning_rate": 2.5951775278725955e-05,
"loss": 1.1934,
"step": 1480
},
{
"epoch": 0.7912614892766751,
"grad_norm": 0.07642642805468737,
"learning_rate": 2.5329579034091455e-05,
"loss": 1.2073,
"step": 1485
},
{
"epoch": 0.7939256693752498,
"grad_norm": 0.08151590645902157,
"learning_rate": 2.4713848858464817e-05,
"loss": 1.181,
"step": 1490
},
{
"epoch": 0.7965898494738244,
"grad_norm": 0.07944386459666944,
"learning_rate": 2.410463807086786e-05,
"loss": 1.1955,
"step": 1495
},
{
"epoch": 0.7992540295723991,
"grad_norm": 0.07711356209897446,
"learning_rate": 2.3501999425777432e-05,
"loss": 1.1891,
"step": 1500
},
{
"epoch": 0.8019182096709737,
"grad_norm": 0.08004988252971652,
"learning_rate": 2.2905985108557114e-05,
"loss": 1.1851,
"step": 1505
},
{
"epoch": 0.8045823897695484,
"grad_norm": 0.08534573467737261,
"learning_rate": 2.2316646730938196e-05,
"loss": 1.1721,
"step": 1510
},
{
"epoch": 0.807246569868123,
"grad_norm": 0.07920097969121198,
"learning_rate": 2.173403532655046e-05,
"loss": 1.1694,
"step": 1515
},
{
"epoch": 0.8099107499666978,
"grad_norm": 0.07511500294571079,
"learning_rate": 2.1158201346502926e-05,
"loss": 1.1746,
"step": 1520
},
{
"epoch": 0.8125749300652724,
"grad_norm": 0.07737975818247868,
"learning_rate": 2.0589194655014898e-05,
"loss": 1.185,
"step": 1525
},
{
"epoch": 0.8152391101638471,
"grad_norm": 0.08190267832244168,
"learning_rate": 2.0027064525098236e-05,
"loss": 1.195,
"step": 1530
},
{
"epoch": 0.8179032902624217,
"grad_norm": 0.08131306910775678,
"learning_rate": 1.9471859634290336e-05,
"loss": 1.1742,
"step": 1535
},
{
"epoch": 0.8205674703609964,
"grad_norm": 0.0858733149118693,
"learning_rate": 1.8923628060439036e-05,
"loss": 1.1898,
"step": 1540
},
{
"epoch": 0.823231650459571,
"grad_norm": 0.07675405619600406,
"learning_rate": 1.838241727753931e-05,
"loss": 1.1881,
"step": 1545
},
{
"epoch": 0.8258958305581457,
"grad_norm": 0.07682847096326867,
"learning_rate": 1.7848274151622234e-05,
"loss": 1.1805,
"step": 1550
},
{
"epoch": 0.8285600106567204,
"grad_norm": 0.08364774954218579,
"learning_rate": 1.732124493669671e-05,
"loss": 1.192,
"step": 1555
},
{
"epoch": 0.8312241907552951,
"grad_norm": 0.07984768371406777,
"learning_rate": 1.6801375270743924e-05,
"loss": 1.1858,
"step": 1560
},
{
"epoch": 0.8338883708538697,
"grad_norm": 0.08106959134364375,
"learning_rate": 1.6288710171765576e-05,
"loss": 1.1813,
"step": 1565
},
{
"epoch": 0.8365525509524444,
"grad_norm": 0.07836491305718539,
"learning_rate": 1.578329403388541e-05,
"loss": 1.1881,
"step": 1570
},
{
"epoch": 0.839216731051019,
"grad_norm": 0.0758115839482243,
"learning_rate": 1.528517062350492e-05,
"loss": 1.1889,
"step": 1575
},
{
"epoch": 0.8418809111495937,
"grad_norm": 0.07588248787185455,
"learning_rate": 1.4794383075513452e-05,
"loss": 1.1768,
"step": 1580
},
{
"epoch": 0.8445450912481683,
"grad_norm": 0.08067020299948538,
"learning_rate": 1.431097388955297e-05,
"loss": 1.2063,
"step": 1585
},
{
"epoch": 0.8472092713467431,
"grad_norm": 0.07642268182718946,
"learning_rate": 1.3834984926337657e-05,
"loss": 1.1589,
"step": 1590
},
{
"epoch": 0.8498734514453177,
"grad_norm": 0.07651179067647247,
"learning_rate": 1.3366457404029275e-05,
"loss": 1.2185,
"step": 1595
},
{
"epoch": 0.8525376315438924,
"grad_norm": 0.07900907023829833,
"learning_rate": 1.2905431894667553e-05,
"loss": 1.163,
"step": 1600
},
{
"epoch": 0.855201811642467,
"grad_norm": 0.07852531932632778,
"learning_rate": 1.2451948320657114e-05,
"loss": 1.1827,
"step": 1605
},
{
"epoch": 0.8578659917410417,
"grad_norm": 0.08065290367816753,
"learning_rate": 1.200604595131033e-05,
"loss": 1.1723,
"step": 1610
},
{
"epoch": 0.8605301718396163,
"grad_norm": 0.0779423491553262,
"learning_rate": 1.1567763399446718e-05,
"loss": 1.1636,
"step": 1615
},
{
"epoch": 0.863194351938191,
"grad_norm": 0.07899604520177515,
"learning_rate": 1.1137138618049404e-05,
"loss": 1.2024,
"step": 1620
},
{
"epoch": 0.8658585320367657,
"grad_norm": 0.08253045338915561,
"learning_rate": 1.0714208896978484e-05,
"loss": 1.1735,
"step": 1625
},
{
"epoch": 0.8685227121353404,
"grad_norm": 0.07870321328036661,
"learning_rate": 1.0299010859742009e-05,
"loss": 1.1731,
"step": 1630
},
{
"epoch": 0.871186892233915,
"grad_norm": 0.08272725502586431,
"learning_rate": 9.891580460324523e-06,
"loss": 1.1929,
"step": 1635
},
{
"epoch": 0.8738510723324897,
"grad_norm": 0.07935943504829367,
"learning_rate": 9.491952980073604e-06,
"loss": 1.1709,
"step": 1640
},
{
"epoch": 0.8765152524310643,
"grad_norm": 0.07895683534188976,
"learning_rate": 9.100163024644815e-06,
"loss": 1.1712,
"step": 1645
},
{
"epoch": 0.879179432529639,
"grad_norm": 0.08082741880893497,
"learning_rate": 8.716244521004846e-06,
"loss": 1.1698,
"step": 1650
},
{
"epoch": 0.8818436126282136,
"grad_norm": 0.07991694698538618,
"learning_rate": 8.34023071449378e-06,
"loss": 1.1789,
"step": 1655
},
{
"epoch": 0.8845077927267884,
"grad_norm": 0.08214765689148273,
"learning_rate": 7.972154165946155e-06,
"loss": 1.1845,
"step": 1660
},
{
"epoch": 0.887171972825363,
"grad_norm": 0.08176590536508709,
"learning_rate": 7.612046748871327e-06,
"loss": 1.1771,
"step": 1665
},
{
"epoch": 0.8898361529239377,
"grad_norm": 0.0808798899357427,
"learning_rate": 7.25993964669347e-06,
"loss": 1.2,
"step": 1670
},
{
"epoch": 0.8925003330225123,
"grad_norm": 0.07776019044347303,
"learning_rate": 6.915863350051199e-06,
"loss": 1.204,
"step": 1675
},
{
"epoch": 0.895164513121087,
"grad_norm": 0.07706209163365262,
"learning_rate": 6.579847654157234e-06,
"loss": 1.1972,
"step": 1680
},
{
"epoch": 0.8978286932196616,
"grad_norm": 0.07653638119319102,
"learning_rate": 6.2519216562183516e-06,
"loss": 1.1623,
"step": 1685
},
{
"epoch": 0.9004928733182364,
"grad_norm": 0.07709508406908554,
"learning_rate": 5.932113752915658e-06,
"loss": 1.165,
"step": 1690
},
{
"epoch": 0.903157053416811,
"grad_norm": 0.07922295657062284,
"learning_rate": 5.620451637945567e-06,
"loss": 1.19,
"step": 1695
},
{
"epoch": 0.9058212335153857,
"grad_norm": 0.07654014983869019,
"learning_rate": 5.316962299621808e-06,
"loss": 1.1708,
"step": 1700
},
{
"epoch": 0.9084854136139603,
"grad_norm": 0.07678567172116282,
"learning_rate": 5.0216720185381595e-06,
"loss": 1.1873,
"step": 1705
},
{
"epoch": 0.911149593712535,
"grad_norm": 0.07752534581690265,
"learning_rate": 4.734606365292871e-06,
"loss": 1.175,
"step": 1710
},
{
"epoch": 0.9138137738111096,
"grad_norm": 0.07543079231709852,
"learning_rate": 4.4557901982743345e-06,
"loss": 1.1718,
"step": 1715
},
{
"epoch": 0.9164779539096843,
"grad_norm": 0.0847130046966339,
"learning_rate": 4.185247661508396e-06,
"loss": 1.1853,
"step": 1720
},
{
"epoch": 0.919142134008259,
"grad_norm": 0.07672126462506362,
"learning_rate": 3.923002182567737e-06,
"loss": 1.1528,
"step": 1725
},
{
"epoch": 0.9218063141068337,
"grad_norm": 0.08103275321155728,
"learning_rate": 3.6690764705430537e-06,
"loss": 1.1925,
"step": 1730
},
{
"epoch": 0.9244704942054083,
"grad_norm": 0.07625095781555866,
"learning_rate": 3.423492514076654e-06,
"loss": 1.1466,
"step": 1735
},
{
"epoch": 0.927134674303983,
"grad_norm": 0.07795348864062035,
"learning_rate": 3.186271579458333e-06,
"loss": 1.1804,
"step": 1740
},
{
"epoch": 0.9297988544025576,
"grad_norm": 0.0798431550705619,
"learning_rate": 2.9574342087837382e-06,
"loss": 1.1948,
"step": 1745
},
{
"epoch": 0.9324630345011323,
"grad_norm": 0.07723730498120181,
"learning_rate": 2.7370002181757114e-06,
"loss": 1.194,
"step": 1750
},
{
"epoch": 0.9351272145997069,
"grad_norm": 0.076024197686587,
"learning_rate": 2.52498869606812e-06,
"loss": 1.1553,
"step": 1755
},
{
"epoch": 0.9377913946982817,
"grad_norm": 0.07525767304214721,
"learning_rate": 2.3214180015530218e-06,
"loss": 1.1717,
"step": 1760
},
{
"epoch": 0.9404555747968563,
"grad_norm": 0.07866979015778834,
"learning_rate": 2.1263057627908478e-06,
"loss": 1.1877,
"step": 1765
},
{
"epoch": 0.943119754895431,
"grad_norm": 0.07918446703667696,
"learning_rate": 1.9396688754838355e-06,
"loss": 1.1825,
"step": 1770
},
{
"epoch": 0.9457839349940056,
"grad_norm": 0.07807410852748545,
"learning_rate": 1.7615235014130205e-06,
"loss": 1.1597,
"step": 1775
},
{
"epoch": 0.9484481150925803,
"grad_norm": 0.07773597995281895,
"learning_rate": 1.5918850670386676e-06,
"loss": 1.1736,
"step": 1780
},
{
"epoch": 0.9511122951911549,
"grad_norm": 0.08101968421115548,
"learning_rate": 1.4307682621644392e-06,
"loss": 1.1726,
"step": 1785
},
{
"epoch": 0.9537764752897295,
"grad_norm": 0.07640809417651317,
"learning_rate": 1.2781870386653017e-06,
"loss": 1.176,
"step": 1790
},
{
"epoch": 0.9564406553883043,
"grad_norm": 0.07624826302993815,
"learning_rate": 1.1341546092794475e-06,
"loss": 1.1712,
"step": 1795
},
{
"epoch": 0.9591048354868789,
"grad_norm": 0.07644512408140866,
"learning_rate": 9.986834464640328e-07,
"loss": 1.1804,
"step": 1800
},
{
"epoch": 0.9617690155854536,
"grad_norm": 0.078535810502388,
"learning_rate": 8.717852813152073e-07,
"loss": 1.1634,
"step": 1805
},
{
"epoch": 0.9644331956840282,
"grad_norm": 0.07806596561964084,
"learning_rate": 7.534711025522167e-07,
"loss": 1.1685,
"step": 1810
},
{
"epoch": 0.9670973757826029,
"grad_norm": 0.07845914770402948,
"learning_rate": 6.437511555658748e-07,
"loss": 1.1704,
"step": 1815
},
{
"epoch": 0.9697615558811775,
"grad_norm": 0.07631630513554058,
"learning_rate": 5.426349415313503e-07,
"loss": 1.167,
"step": 1820
},
{
"epoch": 0.9724257359797522,
"grad_norm": 0.07644461634231602,
"learning_rate": 4.5013121658538107e-07,
"loss": 1.1952,
"step": 1825
},
{
"epoch": 0.9750899160783268,
"grad_norm": 0.07797081390243328,
"learning_rate": 3.662479910681027e-07,
"loss": 1.1587,
"step": 1830
},
{
"epoch": 0.9777540961769016,
"grad_norm": 0.07746705967903876,
"learning_rate": 2.909925288293369e-07,
"loss": 1.1729,
"step": 1835
},
{
"epoch": 0.9804182762754762,
"grad_norm": 0.08118653860364235,
"learning_rate": 2.2437134659962778e-07,
"loss": 1.1637,
"step": 1840
},
{
"epoch": 0.9830824563740509,
"grad_norm": 0.07709302387499754,
"learning_rate": 1.6639021342588213e-07,
"loss": 1.1714,
"step": 1845
},
{
"epoch": 0.9857466364726255,
"grad_norm": 0.07514251112734556,
"learning_rate": 1.1705415017183585e-07,
"loss": 1.1826,
"step": 1850
},
{
"epoch": 0.9884108165712002,
"grad_norm": 0.07785113536208763,
"learning_rate": 7.636742908324613e-08,
"loss": 1.1894,
"step": 1855
},
{
"epoch": 0.9910749966697748,
"grad_norm": 0.08355649548742282,
"learning_rate": 4.4333573417953967e-08,
"loss": 1.1833,
"step": 1860
},
{
"epoch": 0.9937391767683496,
"grad_norm": 0.07679966879567424,
"learning_rate": 2.0955357140783893e-08,
"loss": 1.1922,
"step": 1865
},
{
"epoch": 0.9964033568669242,
"grad_norm": 0.07806796015090577,
"learning_rate": 6.234804683336038e-09,
"loss": 1.1612,
"step": 1870
},
{
"epoch": 0.9990675369654989,
"grad_norm": 0.07653772348938681,
"learning_rate": 1.7319076868194117e-10,
"loss": 1.1679,
"step": 1875
},
{
"epoch": 0.9996003729852138,
"eval_loss": 1.1644140481948853,
"eval_runtime": 1556.6302,
"eval_samples_per_second": 8.594,
"eval_steps_per_second": 0.538,
"step": 1876
},
{
"epoch": 0.9996003729852138,
"step": 1876,
"total_flos": 2.3594858912415744e+16,
"train_loss": 1.2015958401694227,
"train_runtime": 40566.6998,
"train_samples_per_second": 2.961,
"train_steps_per_second": 0.046
}
],
"logging_steps": 5,
"max_steps": 1876,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3594858912415744e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}