dense-1M-12000 / trainer_state.json
bitersun's picture
Upload folder using huggingface_hub
573680f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.18681840472650563,
"eval_steps": 500,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.784100196937735e-05,
"grad_norm": 18.677021026611328,
"learning_rate": 1.2453300124533002e-09,
"loss": 1.0606,
"step": 5
},
{
"epoch": 0.0001556820039387547,
"grad_norm": 8.49953842163086,
"learning_rate": 2.801992528019925e-09,
"loss": 1.0495,
"step": 10
},
{
"epoch": 0.00023352300590813205,
"grad_norm": 4.764372825622559,
"learning_rate": 4.358655043586551e-09,
"loss": 1.0324,
"step": 15
},
{
"epoch": 0.0003113640078775094,
"grad_norm": 6.350065231323242,
"learning_rate": 5.915317559153175e-09,
"loss": 1.0236,
"step": 20
},
{
"epoch": 0.00038920500984688676,
"grad_norm": 21.48935317993164,
"learning_rate": 7.471980074719801e-09,
"loss": 1.0156,
"step": 25
},
{
"epoch": 0.0004670460118162641,
"grad_norm": 11.964753150939941,
"learning_rate": 9.028642590286426e-09,
"loss": 1.1045,
"step": 30
},
{
"epoch": 0.0005448870137856414,
"grad_norm": 16.780696868896484,
"learning_rate": 1.0585305105853052e-08,
"loss": 0.9926,
"step": 35
},
{
"epoch": 0.0006227280157550188,
"grad_norm": 9.311758041381836,
"learning_rate": 1.2141967621419675e-08,
"loss": 1.0672,
"step": 40
},
{
"epoch": 0.0007005690177243961,
"grad_norm": 10.671490669250488,
"learning_rate": 1.36986301369863e-08,
"loss": 1.1432,
"step": 45
},
{
"epoch": 0.0007784100196937735,
"grad_norm": 6.056899547576904,
"learning_rate": 1.5255292652552926e-08,
"loss": 1.0207,
"step": 50
},
{
"epoch": 0.0008562510216631508,
"grad_norm": 12.727471351623535,
"learning_rate": 1.6811955168119553e-08,
"loss": 1.0626,
"step": 55
},
{
"epoch": 0.0009340920236325282,
"grad_norm": 3.2297894954681396,
"learning_rate": 1.8368617683686178e-08,
"loss": 1.0551,
"step": 60
},
{
"epoch": 0.0010119330256019056,
"grad_norm": 5.09151554107666,
"learning_rate": 1.9925280199252803e-08,
"loss": 1.0051,
"step": 65
},
{
"epoch": 0.0010897740275712829,
"grad_norm": 9.829240798950195,
"learning_rate": 2.1481942714819424e-08,
"loss": 1.0936,
"step": 70
},
{
"epoch": 0.0011676150295406602,
"grad_norm": 4.1169023513793945,
"learning_rate": 2.3038605230386048e-08,
"loss": 0.9367,
"step": 75
},
{
"epoch": 0.0012454560315100377,
"grad_norm": 22.784198760986328,
"learning_rate": 2.4595267745952676e-08,
"loss": 1.1475,
"step": 80
},
{
"epoch": 0.001323297033479415,
"grad_norm": 6.216701507568359,
"learning_rate": 2.61519302615193e-08,
"loss": 1.1738,
"step": 85
},
{
"epoch": 0.0014011380354487922,
"grad_norm": 8.767633438110352,
"learning_rate": 2.7708592777085925e-08,
"loss": 1.0995,
"step": 90
},
{
"epoch": 0.0014789790374181697,
"grad_norm": 6.444150447845459,
"learning_rate": 2.926525529265255e-08,
"loss": 1.1674,
"step": 95
},
{
"epoch": 0.001556820039387547,
"grad_norm": 21.939842224121094,
"learning_rate": 3.082191780821918e-08,
"loss": 0.9658,
"step": 100
},
{
"epoch": 0.0016346610413569243,
"grad_norm": 6.07455587387085,
"learning_rate": 3.23785803237858e-08,
"loss": 1.3969,
"step": 105
},
{
"epoch": 0.0017125020433263016,
"grad_norm": 7.66893196105957,
"learning_rate": 3.3935242839352427e-08,
"loss": 1.0274,
"step": 110
},
{
"epoch": 0.0017903430452956791,
"grad_norm": 6.411283016204834,
"learning_rate": 3.549190535491906e-08,
"loss": 1.0922,
"step": 115
},
{
"epoch": 0.0018681840472650564,
"grad_norm": 15.535103797912598,
"learning_rate": 3.704856787048568e-08,
"loss": 1.058,
"step": 120
},
{
"epoch": 0.0019460250492344337,
"grad_norm": 13.108068466186523,
"learning_rate": 3.860523038605231e-08,
"loss": 1.1347,
"step": 125
},
{
"epoch": 0.002023866051203811,
"grad_norm": 5.452599048614502,
"learning_rate": 4.016189290161893e-08,
"loss": 1.07,
"step": 130
},
{
"epoch": 0.0021017070531731885,
"grad_norm": 13.57522964477539,
"learning_rate": 4.1718555417185556e-08,
"loss": 1.0556,
"step": 135
},
{
"epoch": 0.0021795480551425658,
"grad_norm": 4.844541072845459,
"learning_rate": 4.3275217932752174e-08,
"loss": 1.0538,
"step": 140
},
{
"epoch": 0.002257389057111943,
"grad_norm": 7.6000800132751465,
"learning_rate": 4.48318804483188e-08,
"loss": 1.0884,
"step": 145
},
{
"epoch": 0.0023352300590813203,
"grad_norm": 6.445258617401123,
"learning_rate": 4.638854296388542e-08,
"loss": 1.0247,
"step": 150
},
{
"epoch": 0.002413071061050698,
"grad_norm": 4.861091136932373,
"learning_rate": 4.794520547945205e-08,
"loss": 1.0553,
"step": 155
},
{
"epoch": 0.0024909120630200753,
"grad_norm": 6.040435314178467,
"learning_rate": 4.950186799501867e-08,
"loss": 1.0682,
"step": 160
},
{
"epoch": 0.0025687530649894526,
"grad_norm": 10.561899185180664,
"learning_rate": 5.10585305105853e-08,
"loss": 0.9904,
"step": 165
},
{
"epoch": 0.00264659406695883,
"grad_norm": 8.238300323486328,
"learning_rate": 5.261519302615193e-08,
"loss": 1.0576,
"step": 170
},
{
"epoch": 0.002724435068928207,
"grad_norm": 10.821751594543457,
"learning_rate": 5.417185554171855e-08,
"loss": 1.0507,
"step": 175
},
{
"epoch": 0.0028022760708975845,
"grad_norm": 20.215164184570312,
"learning_rate": 5.5728518057285177e-08,
"loss": 1.2059,
"step": 180
},
{
"epoch": 0.0028801170728669618,
"grad_norm": 15.447042465209961,
"learning_rate": 5.72851805728518e-08,
"loss": 1.1047,
"step": 185
},
{
"epoch": 0.0029579580748363395,
"grad_norm": 13.472341537475586,
"learning_rate": 5.8841843088418426e-08,
"loss": 1.0508,
"step": 190
},
{
"epoch": 0.003035799076805717,
"grad_norm": 16.09784507751465,
"learning_rate": 6.039850560398505e-08,
"loss": 1.1144,
"step": 195
},
{
"epoch": 0.003113640078775094,
"grad_norm": 5.519948959350586,
"learning_rate": 6.195516811955167e-08,
"loss": 1.1127,
"step": 200
},
{
"epoch": 0.0031914810807444714,
"grad_norm": 9.467545509338379,
"learning_rate": 6.351183063511831e-08,
"loss": 1.0882,
"step": 205
},
{
"epoch": 0.0032693220827138486,
"grad_norm": 8.895452499389648,
"learning_rate": 6.506849315068492e-08,
"loss": 1.0845,
"step": 210
},
{
"epoch": 0.003347163084683226,
"grad_norm": 10.007709503173828,
"learning_rate": 6.662515566625156e-08,
"loss": 1.1585,
"step": 215
},
{
"epoch": 0.0034250040866526032,
"grad_norm": 10.499605178833008,
"learning_rate": 6.818181818181817e-08,
"loss": 1.103,
"step": 220
},
{
"epoch": 0.003502845088621981,
"grad_norm": 5.367983818054199,
"learning_rate": 6.973848069738481e-08,
"loss": 1.1586,
"step": 225
},
{
"epoch": 0.0035806860905913582,
"grad_norm": 19.27895164489746,
"learning_rate": 7.129514321295142e-08,
"loss": 1.0898,
"step": 230
},
{
"epoch": 0.0036585270925607355,
"grad_norm": 3.7263176441192627,
"learning_rate": 7.285180572851806e-08,
"loss": 1.0541,
"step": 235
},
{
"epoch": 0.003736368094530113,
"grad_norm": 21.48790740966797,
"learning_rate": 7.440846824408468e-08,
"loss": 1.2173,
"step": 240
},
{
"epoch": 0.00381420909649949,
"grad_norm": 5.5661702156066895,
"learning_rate": 7.596513075965131e-08,
"loss": 1.1816,
"step": 245
},
{
"epoch": 0.0038920500984688674,
"grad_norm": 13.601526260375977,
"learning_rate": 7.752179327521793e-08,
"loss": 1.0989,
"step": 250
},
{
"epoch": 0.003969891100438245,
"grad_norm": 9.873005867004395,
"learning_rate": 7.907845579078456e-08,
"loss": 1.1859,
"step": 255
},
{
"epoch": 0.004047732102407622,
"grad_norm": 4.8417277336120605,
"learning_rate": 8.063511830635118e-08,
"loss": 0.9859,
"step": 260
},
{
"epoch": 0.004125573104377,
"grad_norm": 3.8291945457458496,
"learning_rate": 8.21917808219178e-08,
"loss": 0.9709,
"step": 265
},
{
"epoch": 0.004203414106346377,
"grad_norm": 5.504295349121094,
"learning_rate": 8.374844333748443e-08,
"loss": 1.1271,
"step": 270
},
{
"epoch": 0.004281255108315754,
"grad_norm": 10.665711402893066,
"learning_rate": 8.530510585305104e-08,
"loss": 1.1773,
"step": 275
},
{
"epoch": 0.0043590961102851315,
"grad_norm": 8.259835243225098,
"learning_rate": 8.686176836861768e-08,
"loss": 1.1428,
"step": 280
},
{
"epoch": 0.004436937112254509,
"grad_norm": 15.531925201416016,
"learning_rate": 8.84184308841843e-08,
"loss": 1.1423,
"step": 285
},
{
"epoch": 0.004514778114223886,
"grad_norm": 17.920616149902344,
"learning_rate": 8.997509339975093e-08,
"loss": 1.3731,
"step": 290
},
{
"epoch": 0.004592619116193263,
"grad_norm": 5.740132808685303,
"learning_rate": 9.153175591531755e-08,
"loss": 0.9373,
"step": 295
},
{
"epoch": 0.004670460118162641,
"grad_norm": 6.698586463928223,
"learning_rate": 9.308841843088418e-08,
"loss": 1.0603,
"step": 300
},
{
"epoch": 0.004748301120132018,
"grad_norm": 4.851785182952881,
"learning_rate": 9.46450809464508e-08,
"loss": 1.004,
"step": 305
},
{
"epoch": 0.004826142122101396,
"grad_norm": 7.876951217651367,
"learning_rate": 9.620174346201743e-08,
"loss": 1.1607,
"step": 310
},
{
"epoch": 0.004903983124070773,
"grad_norm": 9.093779563903809,
"learning_rate": 9.775840597758405e-08,
"loss": 1.1045,
"step": 315
},
{
"epoch": 0.004981824126040151,
"grad_norm": 16.582103729248047,
"learning_rate": 9.931506849315068e-08,
"loss": 1.1154,
"step": 320
},
{
"epoch": 0.005059665128009528,
"grad_norm": 13.140198707580566,
"learning_rate": 1.008717310087173e-07,
"loss": 1.17,
"step": 325
},
{
"epoch": 0.005137506129978905,
"grad_norm": 3.4895646572113037,
"learning_rate": 1.0242839352428394e-07,
"loss": 0.9535,
"step": 330
},
{
"epoch": 0.0052153471319482825,
"grad_norm": 6.645687103271484,
"learning_rate": 1.0398505603985055e-07,
"loss": 1.05,
"step": 335
},
{
"epoch": 0.00529318813391766,
"grad_norm": 7.615957736968994,
"learning_rate": 1.0554171855541719e-07,
"loss": 0.9174,
"step": 340
},
{
"epoch": 0.005371029135887037,
"grad_norm": 8.536812782287598,
"learning_rate": 1.070983810709838e-07,
"loss": 1.1338,
"step": 345
},
{
"epoch": 0.005448870137856414,
"grad_norm": 4.573184967041016,
"learning_rate": 1.0865504358655044e-07,
"loss": 1.2018,
"step": 350
},
{
"epoch": 0.005526711139825792,
"grad_norm": 11.614198684692383,
"learning_rate": 1.1021170610211705e-07,
"loss": 1.1853,
"step": 355
},
{
"epoch": 0.005604552141795169,
"grad_norm": 12.930988311767578,
"learning_rate": 1.1176836861768369e-07,
"loss": 1.1772,
"step": 360
},
{
"epoch": 0.005682393143764546,
"grad_norm": 5.334465980529785,
"learning_rate": 1.133250311332503e-07,
"loss": 1.1402,
"step": 365
},
{
"epoch": 0.0057602341457339236,
"grad_norm": 19.55135726928711,
"learning_rate": 1.1488169364881693e-07,
"loss": 0.9569,
"step": 370
},
{
"epoch": 0.005838075147703301,
"grad_norm": 14.209831237792969,
"learning_rate": 1.1643835616438355e-07,
"loss": 1.1239,
"step": 375
},
{
"epoch": 0.005915916149672679,
"grad_norm": 5.5656352043151855,
"learning_rate": 1.1799501867995018e-07,
"loss": 1.1074,
"step": 380
},
{
"epoch": 0.005993757151642056,
"grad_norm": 10.571775436401367,
"learning_rate": 1.1955168119551682e-07,
"loss": 1.1663,
"step": 385
},
{
"epoch": 0.006071598153611434,
"grad_norm": 5.807967662811279,
"learning_rate": 1.2110834371108342e-07,
"loss": 1.1868,
"step": 390
},
{
"epoch": 0.006149439155580811,
"grad_norm": 7.003355503082275,
"learning_rate": 1.2266500622665007e-07,
"loss": 1.0249,
"step": 395
},
{
"epoch": 0.006227280157550188,
"grad_norm": 14.337294578552246,
"learning_rate": 1.2422166874221667e-07,
"loss": 1.0405,
"step": 400
},
{
"epoch": 0.006305121159519565,
"grad_norm": 12.388212203979492,
"learning_rate": 1.2577833125778332e-07,
"loss": 1.1801,
"step": 405
},
{
"epoch": 0.006382962161488943,
"grad_norm": 11.25795841217041,
"learning_rate": 1.2733499377334994e-07,
"loss": 1.1672,
"step": 410
},
{
"epoch": 0.00646080316345832,
"grad_norm": 15.970906257629395,
"learning_rate": 1.2889165628891654e-07,
"loss": 1.0815,
"step": 415
},
{
"epoch": 0.006538644165427697,
"grad_norm": 16.4951114654541,
"learning_rate": 1.3044831880448317e-07,
"loss": 1.039,
"step": 420
},
{
"epoch": 0.006616485167397075,
"grad_norm": 16.199981689453125,
"learning_rate": 1.3200498132004982e-07,
"loss": 1.1636,
"step": 425
},
{
"epoch": 0.006694326169366452,
"grad_norm": 7.787930965423584,
"learning_rate": 1.3356164383561644e-07,
"loss": 1.0949,
"step": 430
},
{
"epoch": 0.006772167171335829,
"grad_norm": 4.226932525634766,
"learning_rate": 1.3511830635118307e-07,
"loss": 1.0409,
"step": 435
},
{
"epoch": 0.0068500081733052064,
"grad_norm": 19.068387985229492,
"learning_rate": 1.3667496886674967e-07,
"loss": 0.9881,
"step": 440
},
{
"epoch": 0.006927849175274584,
"grad_norm": 3.8829450607299805,
"learning_rate": 1.3823163138231632e-07,
"loss": 0.9989,
"step": 445
},
{
"epoch": 0.007005690177243962,
"grad_norm": 5.948785305023193,
"learning_rate": 1.3978829389788294e-07,
"loss": 1.007,
"step": 450
},
{
"epoch": 0.007083531179213339,
"grad_norm": 5.125,
"learning_rate": 1.4134495641344957e-07,
"loss": 0.8301,
"step": 455
},
{
"epoch": 0.0071613721811827164,
"grad_norm": 12.499361038208008,
"learning_rate": 1.4290161892901616e-07,
"loss": 1.0923,
"step": 460
},
{
"epoch": 0.007239213183152094,
"grad_norm": 6.266834735870361,
"learning_rate": 1.4445828144458281e-07,
"loss": 0.88,
"step": 465
},
{
"epoch": 0.007317054185121471,
"grad_norm": 9.417441368103027,
"learning_rate": 1.4601494396014944e-07,
"loss": 0.9669,
"step": 470
},
{
"epoch": 0.007394895187090848,
"grad_norm": 9.376644134521484,
"learning_rate": 1.4757160647571606e-07,
"loss": 1.0241,
"step": 475
},
{
"epoch": 0.007472736189060226,
"grad_norm": 10.515301704406738,
"learning_rate": 1.491282689912827e-07,
"loss": 1.0468,
"step": 480
},
{
"epoch": 0.007550577191029603,
"grad_norm": 8.439921379089355,
"learning_rate": 1.506849315068493e-07,
"loss": 0.9356,
"step": 485
},
{
"epoch": 0.00762841819299898,
"grad_norm": 8.198512077331543,
"learning_rate": 1.5224159402241594e-07,
"loss": 1.1591,
"step": 490
},
{
"epoch": 0.0077062591949683575,
"grad_norm": 6.289046287536621,
"learning_rate": 1.5379825653798256e-07,
"loss": 1.0259,
"step": 495
},
{
"epoch": 0.007784100196937735,
"grad_norm": 18.078012466430664,
"learning_rate": 1.5535491905354919e-07,
"loss": 1.1059,
"step": 500
},
{
"epoch": 0.007861941198907112,
"grad_norm": 8.508400917053223,
"learning_rate": 1.569115815691158e-07,
"loss": 1.001,
"step": 505
},
{
"epoch": 0.00793978220087649,
"grad_norm": 6.552981853485107,
"learning_rate": 1.5846824408468243e-07,
"loss": 1.0819,
"step": 510
},
{
"epoch": 0.008017623202845867,
"grad_norm": 5.941412925720215,
"learning_rate": 1.6002490660024906e-07,
"loss": 1.0143,
"step": 515
},
{
"epoch": 0.008095464204815245,
"grad_norm": 10.764496803283691,
"learning_rate": 1.6158156911581568e-07,
"loss": 1.0228,
"step": 520
},
{
"epoch": 0.008173305206784621,
"grad_norm": 5.186371326446533,
"learning_rate": 1.6313823163138233e-07,
"loss": 0.9794,
"step": 525
},
{
"epoch": 0.008251146208754,
"grad_norm": 11.401899337768555,
"learning_rate": 1.6469489414694893e-07,
"loss": 1.0773,
"step": 530
},
{
"epoch": 0.008328987210723376,
"grad_norm": 5.4313788414001465,
"learning_rate": 1.6625155666251556e-07,
"loss": 0.9984,
"step": 535
},
{
"epoch": 0.008406828212692754,
"grad_norm": 7.18859338760376,
"learning_rate": 1.6780821917808218e-07,
"loss": 1.0894,
"step": 540
},
{
"epoch": 0.00848466921466213,
"grad_norm": 5.814337253570557,
"learning_rate": 1.6936488169364883e-07,
"loss": 0.983,
"step": 545
},
{
"epoch": 0.008562510216631508,
"grad_norm": 11.842198371887207,
"learning_rate": 1.7092154420921543e-07,
"loss": 0.994,
"step": 550
},
{
"epoch": 0.008640351218600887,
"grad_norm": 10.12619400024414,
"learning_rate": 1.7247820672478206e-07,
"loss": 1.015,
"step": 555
},
{
"epoch": 0.008718192220570263,
"grad_norm": 7.895757675170898,
"learning_rate": 1.7403486924034868e-07,
"loss": 1.1194,
"step": 560
},
{
"epoch": 0.008796033222539641,
"grad_norm": 5.340054512023926,
"learning_rate": 1.755915317559153e-07,
"loss": 1.0283,
"step": 565
},
{
"epoch": 0.008873874224509018,
"grad_norm": 13.950590133666992,
"learning_rate": 1.7714819427148193e-07,
"loss": 1.155,
"step": 570
},
{
"epoch": 0.008951715226478396,
"grad_norm": 10.90434741973877,
"learning_rate": 1.7870485678704855e-07,
"loss": 1.0135,
"step": 575
},
{
"epoch": 0.009029556228447772,
"grad_norm": 4.94070291519165,
"learning_rate": 1.8026151930261518e-07,
"loss": 0.9843,
"step": 580
},
{
"epoch": 0.00910739723041715,
"grad_norm": 9.50981616973877,
"learning_rate": 1.818181818181818e-07,
"loss": 1.0777,
"step": 585
},
{
"epoch": 0.009185238232386527,
"grad_norm": 9.218316078186035,
"learning_rate": 1.8337484433374845e-07,
"loss": 1.1192,
"step": 590
},
{
"epoch": 0.009263079234355905,
"grad_norm": 17.782791137695312,
"learning_rate": 1.8493150684931505e-07,
"loss": 1.0513,
"step": 595
},
{
"epoch": 0.009340920236325281,
"grad_norm": 5.774691581726074,
"learning_rate": 1.8648816936488168e-07,
"loss": 0.872,
"step": 600
},
{
"epoch": 0.00941876123829466,
"grad_norm": 6.310098171234131,
"learning_rate": 1.880448318804483e-07,
"loss": 0.9395,
"step": 605
},
{
"epoch": 0.009496602240264036,
"grad_norm": 6.68503999710083,
"learning_rate": 1.8960149439601495e-07,
"loss": 1.0394,
"step": 610
},
{
"epoch": 0.009574443242233414,
"grad_norm": 6.972198486328125,
"learning_rate": 1.9115815691158155e-07,
"loss": 1.066,
"step": 615
},
{
"epoch": 0.009652284244202792,
"grad_norm": 6.581061363220215,
"learning_rate": 1.9271481942714817e-07,
"loss": 0.9428,
"step": 620
},
{
"epoch": 0.009730125246172169,
"grad_norm": 10.010781288146973,
"learning_rate": 1.942714819427148e-07,
"loss": 1.097,
"step": 625
},
{
"epoch": 0.009807966248141547,
"grad_norm": 10.270834922790527,
"learning_rate": 1.9582814445828145e-07,
"loss": 1.0373,
"step": 630
},
{
"epoch": 0.009885807250110923,
"grad_norm": 7.189127445220947,
"learning_rate": 1.9738480697384807e-07,
"loss": 1.2356,
"step": 635
},
{
"epoch": 0.009963648252080301,
"grad_norm": 11.2526216506958,
"learning_rate": 1.9894146948941467e-07,
"loss": 0.8422,
"step": 640
},
{
"epoch": 0.010041489254049678,
"grad_norm": 5.1716203689575195,
"learning_rate": 2.004981320049813e-07,
"loss": 1.0132,
"step": 645
},
{
"epoch": 0.010119330256019056,
"grad_norm": 4.592648983001709,
"learning_rate": 2.0205479452054795e-07,
"loss": 1.0677,
"step": 650
},
{
"epoch": 0.010197171257988432,
"grad_norm": 4.74710750579834,
"learning_rate": 2.0361145703611457e-07,
"loss": 0.9573,
"step": 655
},
{
"epoch": 0.01027501225995781,
"grad_norm": 5.075165748596191,
"learning_rate": 2.0516811955168117e-07,
"loss": 0.919,
"step": 660
},
{
"epoch": 0.010352853261927187,
"grad_norm": 5.705000400543213,
"learning_rate": 2.067247820672478e-07,
"loss": 0.9722,
"step": 665
},
{
"epoch": 0.010430694263896565,
"grad_norm": 8.337606430053711,
"learning_rate": 2.0828144458281445e-07,
"loss": 1.0511,
"step": 670
},
{
"epoch": 0.010508535265865942,
"grad_norm": 7.93868350982666,
"learning_rate": 2.0983810709838107e-07,
"loss": 1.0511,
"step": 675
},
{
"epoch": 0.01058637626783532,
"grad_norm": 7.5352325439453125,
"learning_rate": 2.113947696139477e-07,
"loss": 1.0529,
"step": 680
},
{
"epoch": 0.010664217269804696,
"grad_norm": 12.067502975463867,
"learning_rate": 2.129514321295143e-07,
"loss": 0.9818,
"step": 685
},
{
"epoch": 0.010742058271774074,
"grad_norm": 4.793339729309082,
"learning_rate": 2.1450809464508094e-07,
"loss": 0.9757,
"step": 690
},
{
"epoch": 0.010819899273743452,
"grad_norm": 5.648492336273193,
"learning_rate": 2.1606475716064757e-07,
"loss": 0.9838,
"step": 695
},
{
"epoch": 0.010897740275712829,
"grad_norm": 14.50791072845459,
"learning_rate": 2.176214196762142e-07,
"loss": 1.025,
"step": 700
},
{
"epoch": 0.010975581277682207,
"grad_norm": 6.976552486419678,
"learning_rate": 2.191780821917808e-07,
"loss": 1.0636,
"step": 705
},
{
"epoch": 0.011053422279651583,
"grad_norm": 8.440703392028809,
"learning_rate": 2.2073474470734744e-07,
"loss": 0.8979,
"step": 710
},
{
"epoch": 0.011131263281620962,
"grad_norm": 17.822824478149414,
"learning_rate": 2.2229140722291407e-07,
"loss": 1.126,
"step": 715
},
{
"epoch": 0.011209104283590338,
"grad_norm": 3.6384825706481934,
"learning_rate": 2.238480697384807e-07,
"loss": 0.9756,
"step": 720
},
{
"epoch": 0.011286945285559716,
"grad_norm": 9.758706092834473,
"learning_rate": 2.2540473225404732e-07,
"loss": 1.0776,
"step": 725
},
{
"epoch": 0.011364786287529093,
"grad_norm": 6.821314334869385,
"learning_rate": 2.2696139476961394e-07,
"loss": 1.0809,
"step": 730
},
{
"epoch": 0.01144262728949847,
"grad_norm": 5.796785831451416,
"learning_rate": 2.2851805728518056e-07,
"loss": 1.0388,
"step": 735
},
{
"epoch": 0.011520468291467847,
"grad_norm": 14.487456321716309,
"learning_rate": 2.300747198007472e-07,
"loss": 0.9906,
"step": 740
},
{
"epoch": 0.011598309293437225,
"grad_norm": 5.587100505828857,
"learning_rate": 2.3163138231631381e-07,
"loss": 0.955,
"step": 745
},
{
"epoch": 0.011676150295406602,
"grad_norm": 5.029387474060059,
"learning_rate": 2.3318804483188044e-07,
"loss": 0.9509,
"step": 750
},
{
"epoch": 0.01175399129737598,
"grad_norm": 16.782621383666992,
"learning_rate": 2.3474470734744706e-07,
"loss": 1.0278,
"step": 755
},
{
"epoch": 0.011831832299345358,
"grad_norm": 8.211995124816895,
"learning_rate": 2.363013698630137e-07,
"loss": 1.1128,
"step": 760
},
{
"epoch": 0.011909673301314734,
"grad_norm": 8.179312705993652,
"learning_rate": 2.378580323785803e-07,
"loss": 0.9175,
"step": 765
},
{
"epoch": 0.011987514303284113,
"grad_norm": 3.8183233737945557,
"learning_rate": 2.3941469489414696e-07,
"loss": 1.0116,
"step": 770
},
{
"epoch": 0.012065355305253489,
"grad_norm": 11.87375545501709,
"learning_rate": 2.4097135740971356e-07,
"loss": 0.975,
"step": 775
},
{
"epoch": 0.012143196307222867,
"grad_norm": 20.000045776367188,
"learning_rate": 2.425280199252802e-07,
"loss": 1.0561,
"step": 780
},
{
"epoch": 0.012221037309192244,
"grad_norm": 4.025638103485107,
"learning_rate": 2.440846824408468e-07,
"loss": 0.977,
"step": 785
},
{
"epoch": 0.012298878311161622,
"grad_norm": 8.214958190917969,
"learning_rate": 2.4564134495641346e-07,
"loss": 0.9728,
"step": 790
},
{
"epoch": 0.012376719313130998,
"grad_norm": 13.562061309814453,
"learning_rate": 2.4719800747198006e-07,
"loss": 0.8595,
"step": 795
},
{
"epoch": 0.012454560315100376,
"grad_norm": 4.473455905914307,
"learning_rate": 2.4875466998754666e-07,
"loss": 0.8972,
"step": 800
},
{
"epoch": 0.012532401317069753,
"grad_norm": 5.311202049255371,
"learning_rate": 2.503113325031133e-07,
"loss": 0.8578,
"step": 805
},
{
"epoch": 0.01261024231903913,
"grad_norm": 11.063155174255371,
"learning_rate": 2.5186799501867996e-07,
"loss": 0.9938,
"step": 810
},
{
"epoch": 0.012688083321008507,
"grad_norm": 7.260047435760498,
"learning_rate": 2.5342465753424656e-07,
"loss": 0.9745,
"step": 815
},
{
"epoch": 0.012765924322977885,
"grad_norm": 5.3101067543029785,
"learning_rate": 2.549813200498132e-07,
"loss": 1.0001,
"step": 820
},
{
"epoch": 0.012843765324947264,
"grad_norm": 4.430516719818115,
"learning_rate": 2.5653798256537986e-07,
"loss": 0.9283,
"step": 825
},
{
"epoch": 0.01292160632691664,
"grad_norm": 4.081624507904053,
"learning_rate": 2.580946450809464e-07,
"loss": 0.9694,
"step": 830
},
{
"epoch": 0.012999447328886018,
"grad_norm": 6.339404106140137,
"learning_rate": 2.5965130759651306e-07,
"loss": 1.0485,
"step": 835
},
{
"epoch": 0.013077288330855395,
"grad_norm": 8.773398399353027,
"learning_rate": 2.6120797011207965e-07,
"loss": 0.8494,
"step": 840
},
{
"epoch": 0.013155129332824773,
"grad_norm": 9.235841751098633,
"learning_rate": 2.627646326276463e-07,
"loss": 0.9223,
"step": 845
},
{
"epoch": 0.01323297033479415,
"grad_norm": 5.350943565368652,
"learning_rate": 2.6432129514321296e-07,
"loss": 1.0852,
"step": 850
},
{
"epoch": 0.013310811336763527,
"grad_norm": 5.6170268058776855,
"learning_rate": 2.6587795765877955e-07,
"loss": 0.9529,
"step": 855
},
{
"epoch": 0.013388652338732904,
"grad_norm": 6.033858776092529,
"learning_rate": 2.674346201743462e-07,
"loss": 0.9264,
"step": 860
},
{
"epoch": 0.013466493340702282,
"grad_norm": 10.408087730407715,
"learning_rate": 2.6899128268991286e-07,
"loss": 0.9435,
"step": 865
},
{
"epoch": 0.013544334342671658,
"grad_norm": 3.902411460876465,
"learning_rate": 2.7054794520547945e-07,
"loss": 1.0717,
"step": 870
},
{
"epoch": 0.013622175344641036,
"grad_norm": 6.315438270568848,
"learning_rate": 2.7210460772104605e-07,
"loss": 1.0222,
"step": 875
},
{
"epoch": 0.013700016346610413,
"grad_norm": 9.6283540725708,
"learning_rate": 2.7366127023661265e-07,
"loss": 0.9607,
"step": 880
},
{
"epoch": 0.013777857348579791,
"grad_norm": 8.017468452453613,
"learning_rate": 2.752179327521793e-07,
"loss": 0.908,
"step": 885
},
{
"epoch": 0.013855698350549167,
"grad_norm": 4.06109094619751,
"learning_rate": 2.7677459526774595e-07,
"loss": 0.9779,
"step": 890
},
{
"epoch": 0.013933539352518546,
"grad_norm": 4.540249347686768,
"learning_rate": 2.7833125778331255e-07,
"loss": 1.0685,
"step": 895
},
{
"epoch": 0.014011380354487924,
"grad_norm": 5.971028804779053,
"learning_rate": 2.798879202988792e-07,
"loss": 1.0487,
"step": 900
},
{
"epoch": 0.0140892213564573,
"grad_norm": 7.365455150604248,
"learning_rate": 2.8144458281444585e-07,
"loss": 1.0255,
"step": 905
},
{
"epoch": 0.014167062358426678,
"grad_norm": 5.49646520614624,
"learning_rate": 2.8300124533001245e-07,
"loss": 1.0066,
"step": 910
},
{
"epoch": 0.014244903360396055,
"grad_norm": 5.0211615562438965,
"learning_rate": 2.845579078455791e-07,
"loss": 0.9018,
"step": 915
},
{
"epoch": 0.014322744362365433,
"grad_norm": 3.7670419216156006,
"learning_rate": 2.8611457036114565e-07,
"loss": 1.0793,
"step": 920
},
{
"epoch": 0.01440058536433481,
"grad_norm": 10.098974227905273,
"learning_rate": 2.876712328767123e-07,
"loss": 1.0537,
"step": 925
},
{
"epoch": 0.014478426366304187,
"grad_norm": 8.83332633972168,
"learning_rate": 2.8922789539227895e-07,
"loss": 0.9274,
"step": 930
},
{
"epoch": 0.014556267368273564,
"grad_norm": 13.259550094604492,
"learning_rate": 2.9078455790784555e-07,
"loss": 0.9684,
"step": 935
},
{
"epoch": 0.014634108370242942,
"grad_norm": 9.241827964782715,
"learning_rate": 2.923412204234122e-07,
"loss": 1.031,
"step": 940
},
{
"epoch": 0.014711949372212318,
"grad_norm": 7.292890548706055,
"learning_rate": 2.9389788293897885e-07,
"loss": 0.9006,
"step": 945
},
{
"epoch": 0.014789790374181697,
"grad_norm": 4.794684886932373,
"learning_rate": 2.9545454545454545e-07,
"loss": 1.0308,
"step": 950
},
{
"epoch": 0.014867631376151073,
"grad_norm": 4.3201518058776855,
"learning_rate": 2.970112079701121e-07,
"loss": 0.9706,
"step": 955
},
{
"epoch": 0.014945472378120451,
"grad_norm": 3.5388669967651367,
"learning_rate": 2.985678704856787e-07,
"loss": 0.9782,
"step": 960
},
{
"epoch": 0.01502331338008983,
"grad_norm": 10.980652809143066,
"learning_rate": 3.001245330012453e-07,
"loss": 1.1015,
"step": 965
},
{
"epoch": 0.015101154382059206,
"grad_norm": 7.639592170715332,
"learning_rate": 3.0168119551681194e-07,
"loss": 0.9046,
"step": 970
},
{
"epoch": 0.015178995384028584,
"grad_norm": 5.50681734085083,
"learning_rate": 3.0323785803237854e-07,
"loss": 1.0192,
"step": 975
},
{
"epoch": 0.01525683638599796,
"grad_norm": 4.924655437469482,
"learning_rate": 3.047945205479452e-07,
"loss": 1.0545,
"step": 980
},
{
"epoch": 0.015334677387967338,
"grad_norm": 6.294414043426514,
"learning_rate": 3.0635118306351184e-07,
"loss": 0.9807,
"step": 985
},
{
"epoch": 0.015412518389936715,
"grad_norm": 4.609034538269043,
"learning_rate": 3.0790784557907844e-07,
"loss": 1.0205,
"step": 990
},
{
"epoch": 0.015490359391906093,
"grad_norm": 3.4544599056243896,
"learning_rate": 3.094645080946451e-07,
"loss": 0.9761,
"step": 995
},
{
"epoch": 0.01556820039387547,
"grad_norm": 5.186591148376465,
"learning_rate": 3.110211706102117e-07,
"loss": 0.9933,
"step": 1000
},
{
"epoch": 0.015646041395844846,
"grad_norm": 4.516424179077148,
"learning_rate": 3.125778331257783e-07,
"loss": 0.9057,
"step": 1005
},
{
"epoch": 0.015723882397814224,
"grad_norm": 4.458924293518066,
"learning_rate": 3.1413449564134494e-07,
"loss": 0.9526,
"step": 1010
},
{
"epoch": 0.015801723399783602,
"grad_norm": 5.840490341186523,
"learning_rate": 3.1569115815691154e-07,
"loss": 1.0032,
"step": 1015
},
{
"epoch": 0.01587956440175298,
"grad_norm": 13.803277015686035,
"learning_rate": 3.172478206724782e-07,
"loss": 1.0162,
"step": 1020
},
{
"epoch": 0.015957405403722355,
"grad_norm": 3.742831230163574,
"learning_rate": 3.1880448318804484e-07,
"loss": 0.9714,
"step": 1025
},
{
"epoch": 0.016035246405691733,
"grad_norm": 5.748800277709961,
"learning_rate": 3.2036114570361144e-07,
"loss": 0.9545,
"step": 1030
},
{
"epoch": 0.01611308740766111,
"grad_norm": 4.5021491050720215,
"learning_rate": 3.219178082191781e-07,
"loss": 0.9503,
"step": 1035
},
{
"epoch": 0.01619092840963049,
"grad_norm": 6.095613956451416,
"learning_rate": 3.234744707347447e-07,
"loss": 0.991,
"step": 1040
},
{
"epoch": 0.016268769411599868,
"grad_norm": 4.993571758270264,
"learning_rate": 3.2503113325031134e-07,
"loss": 0.9221,
"step": 1045
},
{
"epoch": 0.016346610413569242,
"grad_norm": 5.949316501617432,
"learning_rate": 3.2658779576587794e-07,
"loss": 0.9897,
"step": 1050
},
{
"epoch": 0.01642445141553862,
"grad_norm": 5.225283622741699,
"learning_rate": 3.2814445828144453e-07,
"loss": 0.9719,
"step": 1055
},
{
"epoch": 0.016502292417508,
"grad_norm": 15.378800392150879,
"learning_rate": 3.297011207970112e-07,
"loss": 0.8633,
"step": 1060
},
{
"epoch": 0.016580133419477377,
"grad_norm": 4.347599506378174,
"learning_rate": 3.312577833125778e-07,
"loss": 0.9054,
"step": 1065
},
{
"epoch": 0.01665797442144675,
"grad_norm": 5.208911895751953,
"learning_rate": 3.3281444582814443e-07,
"loss": 0.899,
"step": 1070
},
{
"epoch": 0.01673581542341613,
"grad_norm": 6.316863059997559,
"learning_rate": 3.343711083437111e-07,
"loss": 1.0165,
"step": 1075
},
{
"epoch": 0.016813656425385508,
"grad_norm": 5.477814197540283,
"learning_rate": 3.359277708592777e-07,
"loss": 0.96,
"step": 1080
},
{
"epoch": 0.016891497427354886,
"grad_norm": 4.848371505737305,
"learning_rate": 3.3748443337484433e-07,
"loss": 0.9703,
"step": 1085
},
{
"epoch": 0.01696933842932426,
"grad_norm": 9.025872230529785,
"learning_rate": 3.39041095890411e-07,
"loss": 0.9239,
"step": 1090
},
{
"epoch": 0.01704717943129364,
"grad_norm": 3.3916220664978027,
"learning_rate": 3.4059775840597753e-07,
"loss": 1.0136,
"step": 1095
},
{
"epoch": 0.017125020433263017,
"grad_norm": 9.25607967376709,
"learning_rate": 3.421544209215442e-07,
"loss": 0.9626,
"step": 1100
},
{
"epoch": 0.017202861435232395,
"grad_norm": 7.245452880859375,
"learning_rate": 3.437110834371108e-07,
"loss": 1.0026,
"step": 1105
},
{
"epoch": 0.017280702437201773,
"grad_norm": 3.3463306427001953,
"learning_rate": 3.4526774595267743e-07,
"loss": 0.9243,
"step": 1110
},
{
"epoch": 0.017358543439171148,
"grad_norm": 5.334697723388672,
"learning_rate": 3.468244084682441e-07,
"loss": 1.1001,
"step": 1115
},
{
"epoch": 0.017436384441140526,
"grad_norm": 4.7469305992126465,
"learning_rate": 3.483810709838107e-07,
"loss": 1.0421,
"step": 1120
},
{
"epoch": 0.017514225443109904,
"grad_norm": 4.398116111755371,
"learning_rate": 3.4993773349937733e-07,
"loss": 0.9502,
"step": 1125
},
{
"epoch": 0.017592066445079282,
"grad_norm": 3.972031831741333,
"learning_rate": 3.51494396014944e-07,
"loss": 0.8974,
"step": 1130
},
{
"epoch": 0.017669907447048657,
"grad_norm": 5.13526725769043,
"learning_rate": 3.530510585305106e-07,
"loss": 0.9668,
"step": 1135
},
{
"epoch": 0.017747748449018035,
"grad_norm": 3.752171754837036,
"learning_rate": 3.546077210460772e-07,
"loss": 1.0589,
"step": 1140
},
{
"epoch": 0.017825589450987413,
"grad_norm": 6.005197048187256,
"learning_rate": 3.561643835616438e-07,
"loss": 0.9786,
"step": 1145
},
{
"epoch": 0.01790343045295679,
"grad_norm": 5.12382173538208,
"learning_rate": 3.5772104607721043e-07,
"loss": 0.936,
"step": 1150
},
{
"epoch": 0.017981271454926166,
"grad_norm": 7.456275939941406,
"learning_rate": 3.592777085927771e-07,
"loss": 0.9261,
"step": 1155
},
{
"epoch": 0.018059112456895544,
"grad_norm": 3.7287797927856445,
"learning_rate": 3.608343711083437e-07,
"loss": 0.9291,
"step": 1160
},
{
"epoch": 0.018136953458864923,
"grad_norm": 3.916651725769043,
"learning_rate": 3.6239103362391033e-07,
"loss": 0.9193,
"step": 1165
},
{
"epoch": 0.0182147944608343,
"grad_norm": 4.2813720703125,
"learning_rate": 3.63947696139477e-07,
"loss": 1.0079,
"step": 1170
},
{
"epoch": 0.01829263546280368,
"grad_norm": 8.352608680725098,
"learning_rate": 3.655043586550436e-07,
"loss": 0.9901,
"step": 1175
},
{
"epoch": 0.018370476464773054,
"grad_norm": 5.297429084777832,
"learning_rate": 3.6706102117061023e-07,
"loss": 0.9049,
"step": 1180
},
{
"epoch": 0.01844831746674243,
"grad_norm": 4.064713478088379,
"learning_rate": 3.6861768368617677e-07,
"loss": 1.1307,
"step": 1185
},
{
"epoch": 0.01852615846871181,
"grad_norm": 6.08450174331665,
"learning_rate": 3.701743462017434e-07,
"loss": 1.0267,
"step": 1190
},
{
"epoch": 0.018603999470681188,
"grad_norm": 4.351869106292725,
"learning_rate": 3.717310087173101e-07,
"loss": 1.0315,
"step": 1195
},
{
"epoch": 0.018681840472650563,
"grad_norm": 7.120603084564209,
"learning_rate": 3.7328767123287667e-07,
"loss": 1.0092,
"step": 1200
},
{
"epoch": 0.01875968147461994,
"grad_norm": 4.8134660720825195,
"learning_rate": 3.748443337484433e-07,
"loss": 0.9491,
"step": 1205
},
{
"epoch": 0.01883752247658932,
"grad_norm": 5.852837085723877,
"learning_rate": 3.7640099626401e-07,
"loss": 1.0029,
"step": 1210
},
{
"epoch": 0.018915363478558697,
"grad_norm": 5.291375160217285,
"learning_rate": 3.7795765877957657e-07,
"loss": 0.8198,
"step": 1215
},
{
"epoch": 0.018993204480528072,
"grad_norm": 3.2667717933654785,
"learning_rate": 3.795143212951432e-07,
"loss": 0.9155,
"step": 1220
},
{
"epoch": 0.01907104548249745,
"grad_norm": 4.952467918395996,
"learning_rate": 3.810709838107098e-07,
"loss": 0.9144,
"step": 1225
},
{
"epoch": 0.019148886484466828,
"grad_norm": 4.495504379272461,
"learning_rate": 3.826276463262764e-07,
"loss": 0.9236,
"step": 1230
},
{
"epoch": 0.019226727486436206,
"grad_norm": 5.554149627685547,
"learning_rate": 3.8418430884184307e-07,
"loss": 0.7856,
"step": 1235
},
{
"epoch": 0.019304568488405584,
"grad_norm": 6.092937469482422,
"learning_rate": 3.8574097135740967e-07,
"loss": 0.8681,
"step": 1240
},
{
"epoch": 0.01938240949037496,
"grad_norm": 3.9643170833587646,
"learning_rate": 3.872976338729763e-07,
"loss": 0.836,
"step": 1245
},
{
"epoch": 0.019460250492344337,
"grad_norm": 3.9617724418640137,
"learning_rate": 3.8885429638854297e-07,
"loss": 1.015,
"step": 1250
},
{
"epoch": 0.019538091494313715,
"grad_norm": 8.572834014892578,
"learning_rate": 3.9041095890410957e-07,
"loss": 0.9308,
"step": 1255
},
{
"epoch": 0.019615932496283094,
"grad_norm": 6.380552291870117,
"learning_rate": 3.919676214196762e-07,
"loss": 0.9056,
"step": 1260
},
{
"epoch": 0.01969377349825247,
"grad_norm": 5.703736782073975,
"learning_rate": 3.935242839352428e-07,
"loss": 0.9864,
"step": 1265
},
{
"epoch": 0.019771614500221846,
"grad_norm": 4.2661824226379395,
"learning_rate": 3.9508094645080947e-07,
"loss": 0.8801,
"step": 1270
},
{
"epoch": 0.019849455502191225,
"grad_norm": 3.4654171466827393,
"learning_rate": 3.9663760896637607e-07,
"loss": 0.8954,
"step": 1275
},
{
"epoch": 0.019927296504160603,
"grad_norm": 5.910457611083984,
"learning_rate": 3.9819427148194266e-07,
"loss": 0.8456,
"step": 1280
},
{
"epoch": 0.020005137506129977,
"grad_norm": 6.345880031585693,
"learning_rate": 3.997509339975093e-07,
"loss": 0.9825,
"step": 1285
},
{
"epoch": 0.020082978508099356,
"grad_norm": 11.178544044494629,
"learning_rate": 4.0130759651307597e-07,
"loss": 0.9716,
"step": 1290
},
{
"epoch": 0.020160819510068734,
"grad_norm": 3.9438936710357666,
"learning_rate": 4.0286425902864256e-07,
"loss": 0.9629,
"step": 1295
},
{
"epoch": 0.020238660512038112,
"grad_norm": 7.4510273933410645,
"learning_rate": 4.044209215442092e-07,
"loss": 0.9815,
"step": 1300
},
{
"epoch": 0.020316501514007487,
"grad_norm": 6.15594482421875,
"learning_rate": 4.059775840597758e-07,
"loss": 0.9805,
"step": 1305
},
{
"epoch": 0.020394342515976865,
"grad_norm": 5.105663776397705,
"learning_rate": 4.0753424657534246e-07,
"loss": 1.0047,
"step": 1310
},
{
"epoch": 0.020472183517946243,
"grad_norm": 4.2579779624938965,
"learning_rate": 4.090909090909091e-07,
"loss": 0.797,
"step": 1315
},
{
"epoch": 0.02055002451991562,
"grad_norm": 3.6263747215270996,
"learning_rate": 4.1064757160647566e-07,
"loss": 0.9526,
"step": 1320
},
{
"epoch": 0.020627865521885,
"grad_norm": 4.003891944885254,
"learning_rate": 4.122042341220423e-07,
"loss": 0.862,
"step": 1325
},
{
"epoch": 0.020705706523854374,
"grad_norm": 4.833682060241699,
"learning_rate": 4.137608966376089e-07,
"loss": 1.0438,
"step": 1330
},
{
"epoch": 0.020783547525823752,
"grad_norm": 8.875425338745117,
"learning_rate": 4.1531755915317556e-07,
"loss": 1.0013,
"step": 1335
},
{
"epoch": 0.02086138852779313,
"grad_norm": 5.356649398803711,
"learning_rate": 4.168742216687422e-07,
"loss": 0.8956,
"step": 1340
},
{
"epoch": 0.02093922952976251,
"grad_norm": 5.640366554260254,
"learning_rate": 4.184308841843088e-07,
"loss": 0.8346,
"step": 1345
},
{
"epoch": 0.021017070531731883,
"grad_norm": 3.717663288116455,
"learning_rate": 4.1998754669987546e-07,
"loss": 0.8512,
"step": 1350
},
{
"epoch": 0.02109491153370126,
"grad_norm": 3.557542324066162,
"learning_rate": 4.215442092154421e-07,
"loss": 0.9313,
"step": 1355
},
{
"epoch": 0.02117275253567064,
"grad_norm": 5.178566932678223,
"learning_rate": 4.231008717310087e-07,
"loss": 0.9086,
"step": 1360
},
{
"epoch": 0.021250593537640017,
"grad_norm": 5.773383140563965,
"learning_rate": 4.246575342465753e-07,
"loss": 0.9678,
"step": 1365
},
{
"epoch": 0.021328434539609392,
"grad_norm": 4.725634574890137,
"learning_rate": 4.262141967621419e-07,
"loss": 0.9356,
"step": 1370
},
{
"epoch": 0.02140627554157877,
"grad_norm": 3.0198757648468018,
"learning_rate": 4.2777085927770856e-07,
"loss": 0.9342,
"step": 1375
},
{
"epoch": 0.02148411654354815,
"grad_norm": 5.704006195068359,
"learning_rate": 4.293275217932752e-07,
"loss": 1.0469,
"step": 1380
},
{
"epoch": 0.021561957545517527,
"grad_norm": 4.559571743011475,
"learning_rate": 4.308841843088418e-07,
"loss": 0.845,
"step": 1385
},
{
"epoch": 0.021639798547486905,
"grad_norm": 9.018213272094727,
"learning_rate": 4.3244084682440846e-07,
"loss": 0.9221,
"step": 1390
},
{
"epoch": 0.02171763954945628,
"grad_norm": 6.414641380310059,
"learning_rate": 4.339975093399751e-07,
"loss": 1.0877,
"step": 1395
},
{
"epoch": 0.021795480551425658,
"grad_norm": 4.217600345611572,
"learning_rate": 4.355541718555417e-07,
"loss": 0.8761,
"step": 1400
},
{
"epoch": 0.021873321553395036,
"grad_norm": 5.274855136871338,
"learning_rate": 4.3711083437110836e-07,
"loss": 0.9046,
"step": 1405
},
{
"epoch": 0.021951162555364414,
"grad_norm": 11.607494354248047,
"learning_rate": 4.386674968866749e-07,
"loss": 0.981,
"step": 1410
},
{
"epoch": 0.02202900355733379,
"grad_norm": 5.442785263061523,
"learning_rate": 4.4022415940224155e-07,
"loss": 0.9357,
"step": 1415
},
{
"epoch": 0.022106844559303167,
"grad_norm": 4.934208869934082,
"learning_rate": 4.417808219178082e-07,
"loss": 0.9124,
"step": 1420
},
{
"epoch": 0.022184685561272545,
"grad_norm": 5.2812933921813965,
"learning_rate": 4.433374844333748e-07,
"loss": 0.9655,
"step": 1425
},
{
"epoch": 0.022262526563241923,
"grad_norm": 3.1578216552734375,
"learning_rate": 4.4489414694894145e-07,
"loss": 0.9452,
"step": 1430
},
{
"epoch": 0.022340367565211298,
"grad_norm": 10.148691177368164,
"learning_rate": 4.464508094645081e-07,
"loss": 0.9015,
"step": 1435
},
{
"epoch": 0.022418208567180676,
"grad_norm": 3.150479793548584,
"learning_rate": 4.480074719800747e-07,
"loss": 0.8744,
"step": 1440
},
{
"epoch": 0.022496049569150054,
"grad_norm": 5.963056564331055,
"learning_rate": 4.4956413449564135e-07,
"loss": 1.0312,
"step": 1445
},
{
"epoch": 0.022573890571119432,
"grad_norm": 5.098721981048584,
"learning_rate": 4.5112079701120795e-07,
"loss": 0.8031,
"step": 1450
},
{
"epoch": 0.02265173157308881,
"grad_norm": 5.2625017166137695,
"learning_rate": 4.5267745952677455e-07,
"loss": 0.8132,
"step": 1455
},
{
"epoch": 0.022729572575058185,
"grad_norm": 8.537793159484863,
"learning_rate": 4.542341220423412e-07,
"loss": 0.8296,
"step": 1460
},
{
"epoch": 0.022807413577027563,
"grad_norm": 6.819812774658203,
"learning_rate": 4.557907845579078e-07,
"loss": 0.9476,
"step": 1465
},
{
"epoch": 0.02288525457899694,
"grad_norm": 4.941056251525879,
"learning_rate": 4.5734744707347445e-07,
"loss": 0.8785,
"step": 1470
},
{
"epoch": 0.02296309558096632,
"grad_norm": 5.378219127655029,
"learning_rate": 4.589041095890411e-07,
"loss": 0.8987,
"step": 1475
},
{
"epoch": 0.023040936582935694,
"grad_norm": 4.793314456939697,
"learning_rate": 4.604607721046077e-07,
"loss": 0.8309,
"step": 1480
},
{
"epoch": 0.023118777584905072,
"grad_norm": 7.7251434326171875,
"learning_rate": 4.6201743462017435e-07,
"loss": 0.959,
"step": 1485
},
{
"epoch": 0.02319661858687445,
"grad_norm": 3.7208149433135986,
"learning_rate": 4.6357409713574095e-07,
"loss": 0.9126,
"step": 1490
},
{
"epoch": 0.02327445958884383,
"grad_norm": 4.322316646575928,
"learning_rate": 4.651307596513076e-07,
"loss": 0.9567,
"step": 1495
},
{
"epoch": 0.023352300590813203,
"grad_norm": 5.451142311096191,
"learning_rate": 4.666874221668742e-07,
"loss": 0.9943,
"step": 1500
},
{
"epoch": 0.02343014159278258,
"grad_norm": 6.478999614715576,
"learning_rate": 4.682440846824408e-07,
"loss": 0.8577,
"step": 1505
},
{
"epoch": 0.02350798259475196,
"grad_norm": 5.626023292541504,
"learning_rate": 4.6980074719800745e-07,
"loss": 0.9176,
"step": 1510
},
{
"epoch": 0.023585823596721338,
"grad_norm": 9.153360366821289,
"learning_rate": 4.713574097135741e-07,
"loss": 1.0269,
"step": 1515
},
{
"epoch": 0.023663664598690716,
"grad_norm": 11.129598617553711,
"learning_rate": 4.729140722291407e-07,
"loss": 0.9428,
"step": 1520
},
{
"epoch": 0.02374150560066009,
"grad_norm": 5.8177313804626465,
"learning_rate": 4.7447073474470735e-07,
"loss": 1.0107,
"step": 1525
},
{
"epoch": 0.02381934660262947,
"grad_norm": 6.537820816040039,
"learning_rate": 4.7602739726027394e-07,
"loss": 0.8104,
"step": 1530
},
{
"epoch": 0.023897187604598847,
"grad_norm": 4.420594692230225,
"learning_rate": 4.775840597758406e-07,
"loss": 0.8484,
"step": 1535
},
{
"epoch": 0.023975028606568225,
"grad_norm": 6.306564807891846,
"learning_rate": 4.791407222914072e-07,
"loss": 0.796,
"step": 1540
},
{
"epoch": 0.0240528696085376,
"grad_norm": 11.836288452148438,
"learning_rate": 4.806973848069738e-07,
"loss": 0.8949,
"step": 1545
},
{
"epoch": 0.024130710610506978,
"grad_norm": 4.565202713012695,
"learning_rate": 4.822540473225404e-07,
"loss": 0.8881,
"step": 1550
},
{
"epoch": 0.024208551612476356,
"grad_norm": 4.610184669494629,
"learning_rate": 4.83810709838107e-07,
"loss": 1.0267,
"step": 1555
},
{
"epoch": 0.024286392614445734,
"grad_norm": 4.136282444000244,
"learning_rate": 4.853673723536737e-07,
"loss": 0.9593,
"step": 1560
},
{
"epoch": 0.02436423361641511,
"grad_norm": 4.203325271606445,
"learning_rate": 4.869240348692403e-07,
"loss": 0.9377,
"step": 1565
},
{
"epoch": 0.024442074618384487,
"grad_norm": 8.468722343444824,
"learning_rate": 4.88480697384807e-07,
"loss": 0.9949,
"step": 1570
},
{
"epoch": 0.024519915620353865,
"grad_norm": 7.116949558258057,
"learning_rate": 4.900373599003736e-07,
"loss": 0.92,
"step": 1575
},
{
"epoch": 0.024597756622323243,
"grad_norm": 4.857876777648926,
"learning_rate": 4.915940224159402e-07,
"loss": 0.8945,
"step": 1580
},
{
"epoch": 0.02467559762429262,
"grad_norm": 7.421228408813477,
"learning_rate": 4.931506849315068e-07,
"loss": 0.8807,
"step": 1585
},
{
"epoch": 0.024753438626261996,
"grad_norm": 7.203330993652344,
"learning_rate": 4.947073474470734e-07,
"loss": 0.8998,
"step": 1590
},
{
"epoch": 0.024831279628231374,
"grad_norm": 12.598939895629883,
"learning_rate": 4.9626400996264e-07,
"loss": 0.8843,
"step": 1595
},
{
"epoch": 0.024909120630200753,
"grad_norm": 6.573790073394775,
"learning_rate": 4.978206724782067e-07,
"loss": 0.8954,
"step": 1600
},
{
"epoch": 0.02498696163217013,
"grad_norm": 5.063882350921631,
"learning_rate": 4.993773349937733e-07,
"loss": 0.8539,
"step": 1605
},
{
"epoch": 0.025064802634139505,
"grad_norm": 5.859914779663086,
"learning_rate": 5.0093399750934e-07,
"loss": 0.9102,
"step": 1610
},
{
"epoch": 0.025142643636108884,
"grad_norm": 4.542943954467773,
"learning_rate": 5.024906600249066e-07,
"loss": 0.876,
"step": 1615
},
{
"epoch": 0.02522048463807826,
"grad_norm": 6.943472862243652,
"learning_rate": 5.040473225404732e-07,
"loss": 0.8886,
"step": 1620
},
{
"epoch": 0.02529832564004764,
"grad_norm": 5.794211387634277,
"learning_rate": 5.056039850560398e-07,
"loss": 0.9136,
"step": 1625
},
{
"epoch": 0.025376166642017015,
"grad_norm": 3.58612322807312,
"learning_rate": 5.071606475716065e-07,
"loss": 0.8483,
"step": 1630
},
{
"epoch": 0.025454007643986393,
"grad_norm": 8.513461112976074,
"learning_rate": 5.087173100871731e-07,
"loss": 0.9484,
"step": 1635
},
{
"epoch": 0.02553184864595577,
"grad_norm": 3.152209997177124,
"learning_rate": 5.102739726027398e-07,
"loss": 0.8564,
"step": 1640
},
{
"epoch": 0.02560968964792515,
"grad_norm": 11.711703300476074,
"learning_rate": 5.118306351183063e-07,
"loss": 0.7842,
"step": 1645
},
{
"epoch": 0.025687530649894527,
"grad_norm": 4.101468086242676,
"learning_rate": 5.13387297633873e-07,
"loss": 0.8646,
"step": 1650
},
{
"epoch": 0.025765371651863902,
"grad_norm": 3.844512462615967,
"learning_rate": 5.149439601494395e-07,
"loss": 0.8094,
"step": 1655
},
{
"epoch": 0.02584321265383328,
"grad_norm": 3.546029567718506,
"learning_rate": 5.165006226650062e-07,
"loss": 0.87,
"step": 1660
},
{
"epoch": 0.025921053655802658,
"grad_norm": 3.3729195594787598,
"learning_rate": 5.180572851805728e-07,
"loss": 0.9467,
"step": 1665
},
{
"epoch": 0.025998894657772036,
"grad_norm": 3.984131336212158,
"learning_rate": 5.196139476961394e-07,
"loss": 0.8571,
"step": 1670
},
{
"epoch": 0.02607673565974141,
"grad_norm": 5.9442291259765625,
"learning_rate": 5.21170610211706e-07,
"loss": 0.892,
"step": 1675
},
{
"epoch": 0.02615457666171079,
"grad_norm": 6.404414653778076,
"learning_rate": 5.227272727272727e-07,
"loss": 0.8978,
"step": 1680
},
{
"epoch": 0.026232417663680167,
"grad_norm": 8.53201961517334,
"learning_rate": 5.242839352428393e-07,
"loss": 0.8403,
"step": 1685
},
{
"epoch": 0.026310258665649545,
"grad_norm": 7.944653511047363,
"learning_rate": 5.25840597758406e-07,
"loss": 0.7752,
"step": 1690
},
{
"epoch": 0.02638809966761892,
"grad_norm": 4.13915491104126,
"learning_rate": 5.273972602739725e-07,
"loss": 1.053,
"step": 1695
},
{
"epoch": 0.0264659406695883,
"grad_norm": 9.199925422668457,
"learning_rate": 5.289539227895392e-07,
"loss": 1.0808,
"step": 1700
},
{
"epoch": 0.026543781671557676,
"grad_norm": 4.507978439331055,
"learning_rate": 5.305105853051058e-07,
"loss": 0.9654,
"step": 1705
},
{
"epoch": 0.026621622673527055,
"grad_norm": 5.004615783691406,
"learning_rate": 5.320672478206725e-07,
"loss": 0.9003,
"step": 1710
},
{
"epoch": 0.02669946367549643,
"grad_norm": 9.572540283203125,
"learning_rate": 5.336239103362391e-07,
"loss": 1.0496,
"step": 1715
},
{
"epoch": 0.026777304677465807,
"grad_norm": 6.494607925415039,
"learning_rate": 5.351805728518058e-07,
"loss": 0.9283,
"step": 1720
},
{
"epoch": 0.026855145679435186,
"grad_norm": 6.419877529144287,
"learning_rate": 5.367372353673723e-07,
"loss": 0.9118,
"step": 1725
},
{
"epoch": 0.026932986681404564,
"grad_norm": 8.065162658691406,
"learning_rate": 5.38293897882939e-07,
"loss": 0.9376,
"step": 1730
},
{
"epoch": 0.027010827683373942,
"grad_norm": 11.1658935546875,
"learning_rate": 5.398505603985056e-07,
"loss": 0.9734,
"step": 1735
},
{
"epoch": 0.027088668685343317,
"grad_norm": 8.80482006072998,
"learning_rate": 5.414072229140723e-07,
"loss": 0.9357,
"step": 1740
},
{
"epoch": 0.027166509687312695,
"grad_norm": 9.545907974243164,
"learning_rate": 5.429638854296388e-07,
"loss": 1.0168,
"step": 1745
},
{
"epoch": 0.027244350689282073,
"grad_norm": 3.2502315044403076,
"learning_rate": 5.445205479452054e-07,
"loss": 0.8814,
"step": 1750
},
{
"epoch": 0.02732219169125145,
"grad_norm": 7.160440921783447,
"learning_rate": 5.46077210460772e-07,
"loss": 0.8058,
"step": 1755
},
{
"epoch": 0.027400032693220826,
"grad_norm": 4.625821113586426,
"learning_rate": 5.476338729763387e-07,
"loss": 0.8834,
"step": 1760
},
{
"epoch": 0.027477873695190204,
"grad_norm": 6.714595317840576,
"learning_rate": 5.491905354919053e-07,
"loss": 0.9709,
"step": 1765
},
{
"epoch": 0.027555714697159582,
"grad_norm": 5.669415473937988,
"learning_rate": 5.50747198007472e-07,
"loss": 0.9646,
"step": 1770
},
{
"epoch": 0.02763355569912896,
"grad_norm": 6.046622276306152,
"learning_rate": 5.523038605230385e-07,
"loss": 0.8469,
"step": 1775
},
{
"epoch": 0.027711396701098335,
"grad_norm": 16.526947021484375,
"learning_rate": 5.538605230386052e-07,
"loss": 0.877,
"step": 1780
},
{
"epoch": 0.027789237703067713,
"grad_norm": 4.415500164031982,
"learning_rate": 5.554171855541718e-07,
"loss": 0.8356,
"step": 1785
},
{
"epoch": 0.02786707870503709,
"grad_norm": 4.823260307312012,
"learning_rate": 5.569738480697385e-07,
"loss": 0.9628,
"step": 1790
},
{
"epoch": 0.02794491970700647,
"grad_norm": 8.501585006713867,
"learning_rate": 5.585305105853051e-07,
"loss": 0.7859,
"step": 1795
},
{
"epoch": 0.028022760708975848,
"grad_norm": 10.616768836975098,
"learning_rate": 5.600871731008718e-07,
"loss": 0.9145,
"step": 1800
},
{
"epoch": 0.028100601710945222,
"grad_norm": 6.610407829284668,
"learning_rate": 5.616438356164383e-07,
"loss": 0.8058,
"step": 1805
},
{
"epoch": 0.0281784427129146,
"grad_norm": 4.978299617767334,
"learning_rate": 5.63200498132005e-07,
"loss": 0.8,
"step": 1810
},
{
"epoch": 0.02825628371488398,
"grad_norm": 4.922807693481445,
"learning_rate": 5.647571606475716e-07,
"loss": 1.0388,
"step": 1815
},
{
"epoch": 0.028334124716853357,
"grad_norm": 6.71333122253418,
"learning_rate": 5.663138231631383e-07,
"loss": 0.9221,
"step": 1820
},
{
"epoch": 0.02841196571882273,
"grad_norm": 4.787428379058838,
"learning_rate": 5.678704856787049e-07,
"loss": 0.8824,
"step": 1825
},
{
"epoch": 0.02848980672079211,
"grad_norm": 8.047598838806152,
"learning_rate": 5.694271481942715e-07,
"loss": 0.8187,
"step": 1830
},
{
"epoch": 0.028567647722761488,
"grad_norm": 6.064495086669922,
"learning_rate": 5.70983810709838e-07,
"loss": 0.7595,
"step": 1835
},
{
"epoch": 0.028645488724730866,
"grad_norm": 4.46295690536499,
"learning_rate": 5.725404732254047e-07,
"loss": 0.9397,
"step": 1840
},
{
"epoch": 0.02872332972670024,
"grad_norm": 7.761974334716797,
"learning_rate": 5.740971357409713e-07,
"loss": 0.915,
"step": 1845
},
{
"epoch": 0.02880117072866962,
"grad_norm": 5.134248733520508,
"learning_rate": 5.75653798256538e-07,
"loss": 0.862,
"step": 1850
},
{
"epoch": 0.028879011730638997,
"grad_norm": 5.424485206604004,
"learning_rate": 5.772104607721045e-07,
"loss": 0.8359,
"step": 1855
},
{
"epoch": 0.028956852732608375,
"grad_norm": 2.9714298248291016,
"learning_rate": 5.787671232876712e-07,
"loss": 0.7999,
"step": 1860
},
{
"epoch": 0.029034693734577753,
"grad_norm": 6.131465911865234,
"learning_rate": 5.803237858032378e-07,
"loss": 0.8619,
"step": 1865
},
{
"epoch": 0.029112534736547128,
"grad_norm": 7.894665241241455,
"learning_rate": 5.818804483188045e-07,
"loss": 0.771,
"step": 1870
},
{
"epoch": 0.029190375738516506,
"grad_norm": 3.163548469543457,
"learning_rate": 5.834371108343711e-07,
"loss": 0.7482,
"step": 1875
},
{
"epoch": 0.029268216740485884,
"grad_norm": 5.383469581604004,
"learning_rate": 5.849937733499378e-07,
"loss": 0.895,
"step": 1880
},
{
"epoch": 0.029346057742455262,
"grad_norm": 6.841033935546875,
"learning_rate": 5.865504358655043e-07,
"loss": 0.8822,
"step": 1885
},
{
"epoch": 0.029423898744424637,
"grad_norm": 9.069436073303223,
"learning_rate": 5.88107098381071e-07,
"loss": 0.8947,
"step": 1890
},
{
"epoch": 0.029501739746394015,
"grad_norm": 5.3066725730896,
"learning_rate": 5.896637608966376e-07,
"loss": 0.8046,
"step": 1895
},
{
"epoch": 0.029579580748363393,
"grad_norm": 5.761783599853516,
"learning_rate": 5.912204234122043e-07,
"loss": 0.9087,
"step": 1900
},
{
"epoch": 0.02965742175033277,
"grad_norm": 3.4487996101379395,
"learning_rate": 5.927770859277709e-07,
"loss": 0.9291,
"step": 1905
},
{
"epoch": 0.029735262752302146,
"grad_norm": 5.8793816566467285,
"learning_rate": 5.943337484433375e-07,
"loss": 0.8244,
"step": 1910
},
{
"epoch": 0.029813103754271524,
"grad_norm": 6.812746047973633,
"learning_rate": 5.958904109589041e-07,
"loss": 0.8169,
"step": 1915
},
{
"epoch": 0.029890944756240902,
"grad_norm": 5.695523738861084,
"learning_rate": 5.974470734744707e-07,
"loss": 0.9072,
"step": 1920
},
{
"epoch": 0.02996878575821028,
"grad_norm": 3.443061590194702,
"learning_rate": 5.990037359900373e-07,
"loss": 0.8709,
"step": 1925
},
{
"epoch": 0.03004662676017966,
"grad_norm": 6.014828681945801,
"learning_rate": 6.00560398505604e-07,
"loss": 0.917,
"step": 1930
},
{
"epoch": 0.030124467762149033,
"grad_norm": 4.14946985244751,
"learning_rate": 6.021170610211705e-07,
"loss": 0.8827,
"step": 1935
},
{
"epoch": 0.03020230876411841,
"grad_norm": 4.128273963928223,
"learning_rate": 6.036737235367372e-07,
"loss": 0.8062,
"step": 1940
},
{
"epoch": 0.03028014976608779,
"grad_norm": 5.5036115646362305,
"learning_rate": 6.052303860523038e-07,
"loss": 0.9832,
"step": 1945
},
{
"epoch": 0.030357990768057168,
"grad_norm": 5.694386005401611,
"learning_rate": 6.067870485678705e-07,
"loss": 0.7735,
"step": 1950
},
{
"epoch": 0.030435831770026543,
"grad_norm": 3.861293315887451,
"learning_rate": 6.083437110834371e-07,
"loss": 0.8782,
"step": 1955
},
{
"epoch": 0.03051367277199592,
"grad_norm": 5.179184436798096,
"learning_rate": 6.099003735990037e-07,
"loss": 0.908,
"step": 1960
},
{
"epoch": 0.0305915137739653,
"grad_norm": 4.929222106933594,
"learning_rate": 6.114570361145703e-07,
"loss": 0.8967,
"step": 1965
},
{
"epoch": 0.030669354775934677,
"grad_norm": 3.300053596496582,
"learning_rate": 6.13013698630137e-07,
"loss": 0.9517,
"step": 1970
},
{
"epoch": 0.03074719577790405,
"grad_norm": 4.976810932159424,
"learning_rate": 6.145703611457036e-07,
"loss": 0.8676,
"step": 1975
},
{
"epoch": 0.03082503677987343,
"grad_norm": 3.866328477859497,
"learning_rate": 6.161270236612703e-07,
"loss": 0.9735,
"step": 1980
},
{
"epoch": 0.030902877781842808,
"grad_norm": 4.272680759429932,
"learning_rate": 6.176836861768369e-07,
"loss": 0.9716,
"step": 1985
},
{
"epoch": 0.030980718783812186,
"grad_norm": 6.74641752243042,
"learning_rate": 6.192403486924035e-07,
"loss": 0.815,
"step": 1990
},
{
"epoch": 0.031058559785781564,
"grad_norm": 3.4278452396392822,
"learning_rate": 6.207970112079701e-07,
"loss": 0.9165,
"step": 1995
},
{
"epoch": 0.03113640078775094,
"grad_norm": 7.538846492767334,
"learning_rate": 6.223536737235368e-07,
"loss": 0.9513,
"step": 2000
},
{
"epoch": 0.031214241789720317,
"grad_norm": 4.969770431518555,
"learning_rate": 6.239103362391034e-07,
"loss": 0.816,
"step": 2005
},
{
"epoch": 0.03129208279168969,
"grad_norm": 9.244134902954102,
"learning_rate": 6.2546699875467e-07,
"loss": 0.9293,
"step": 2010
},
{
"epoch": 0.03136992379365907,
"grad_norm": 5.617055416107178,
"learning_rate": 6.270236612702365e-07,
"loss": 0.8553,
"step": 2015
},
{
"epoch": 0.03144776479562845,
"grad_norm": 4.888432502746582,
"learning_rate": 6.285803237858031e-07,
"loss": 0.8679,
"step": 2020
},
{
"epoch": 0.031525605797597826,
"grad_norm": 4.528554916381836,
"learning_rate": 6.301369863013698e-07,
"loss": 0.9086,
"step": 2025
},
{
"epoch": 0.031603446799567204,
"grad_norm": 6.504762172698975,
"learning_rate": 6.316936488169364e-07,
"loss": 0.9501,
"step": 2030
},
{
"epoch": 0.03168128780153658,
"grad_norm": 3.974257230758667,
"learning_rate": 6.332503113325031e-07,
"loss": 0.888,
"step": 2035
},
{
"epoch": 0.03175912880350596,
"grad_norm": 8.628198623657227,
"learning_rate": 6.348069738480696e-07,
"loss": 0.8664,
"step": 2040
},
{
"epoch": 0.03183696980547534,
"grad_norm": 4.1892805099487305,
"learning_rate": 6.363636363636363e-07,
"loss": 0.9896,
"step": 2045
},
{
"epoch": 0.03191481080744471,
"grad_norm": 5.350588321685791,
"learning_rate": 6.37920298879203e-07,
"loss": 0.8815,
"step": 2050
},
{
"epoch": 0.03199265180941409,
"grad_norm": 5.569740295410156,
"learning_rate": 6.394769613947696e-07,
"loss": 0.8785,
"step": 2055
},
{
"epoch": 0.032070492811383466,
"grad_norm": 7.358509063720703,
"learning_rate": 6.410336239103362e-07,
"loss": 0.8415,
"step": 2060
},
{
"epoch": 0.032148333813352845,
"grad_norm": 5.384446144104004,
"learning_rate": 6.425902864259029e-07,
"loss": 0.9108,
"step": 2065
},
{
"epoch": 0.03222617481532222,
"grad_norm": 4.48892068862915,
"learning_rate": 6.441469489414694e-07,
"loss": 0.9423,
"step": 2070
},
{
"epoch": 0.0323040158172916,
"grad_norm": 4.302936553955078,
"learning_rate": 6.457036114570361e-07,
"loss": 0.8849,
"step": 2075
},
{
"epoch": 0.03238185681926098,
"grad_norm": 5.185121536254883,
"learning_rate": 6.472602739726027e-07,
"loss": 0.8177,
"step": 2080
},
{
"epoch": 0.03245969782123036,
"grad_norm": 3.2999234199523926,
"learning_rate": 6.488169364881694e-07,
"loss": 0.9199,
"step": 2085
},
{
"epoch": 0.032537538823199735,
"grad_norm": 19.133163452148438,
"learning_rate": 6.50373599003736e-07,
"loss": 0.9283,
"step": 2090
},
{
"epoch": 0.03261537982516911,
"grad_norm": 3.4535083770751953,
"learning_rate": 6.519302615193026e-07,
"loss": 0.9707,
"step": 2095
},
{
"epoch": 0.032693220827138485,
"grad_norm": 11.507316589355469,
"learning_rate": 6.534869240348691e-07,
"loss": 0.8878,
"step": 2100
},
{
"epoch": 0.03277106182910786,
"grad_norm": 21.363101959228516,
"learning_rate": 6.550435865504358e-07,
"loss": 0.9761,
"step": 2105
},
{
"epoch": 0.03284890283107724,
"grad_norm": 4.29213285446167,
"learning_rate": 6.566002490660024e-07,
"loss": 0.7789,
"step": 2110
},
{
"epoch": 0.03292674383304662,
"grad_norm": 7.540319442749023,
"learning_rate": 6.581569115815691e-07,
"loss": 0.7991,
"step": 2115
},
{
"epoch": 0.033004584835016,
"grad_norm": 3.658780097961426,
"learning_rate": 6.597135740971356e-07,
"loss": 0.9211,
"step": 2120
},
{
"epoch": 0.033082425836985375,
"grad_norm": 8.205567359924316,
"learning_rate": 6.612702366127023e-07,
"loss": 0.8083,
"step": 2125
},
{
"epoch": 0.033160266838954754,
"grad_norm": 6.272342681884766,
"learning_rate": 6.628268991282689e-07,
"loss": 0.9724,
"step": 2130
},
{
"epoch": 0.03323810784092413,
"grad_norm": 7.037917137145996,
"learning_rate": 6.643835616438356e-07,
"loss": 0.8821,
"step": 2135
},
{
"epoch": 0.0333159488428935,
"grad_norm": 6.3946027755737305,
"learning_rate": 6.659402241594022e-07,
"loss": 0.9065,
"step": 2140
},
{
"epoch": 0.03339378984486288,
"grad_norm": 7.079307556152344,
"learning_rate": 6.674968866749689e-07,
"loss": 0.993,
"step": 2145
},
{
"epoch": 0.03347163084683226,
"grad_norm": 6.372123718261719,
"learning_rate": 6.690535491905354e-07,
"loss": 0.9372,
"step": 2150
},
{
"epoch": 0.03354947184880164,
"grad_norm": 2.9949862957000732,
"learning_rate": 6.706102117061021e-07,
"loss": 0.7785,
"step": 2155
},
{
"epoch": 0.033627312850771016,
"grad_norm": 5.278440475463867,
"learning_rate": 6.721668742216687e-07,
"loss": 0.8646,
"step": 2160
},
{
"epoch": 0.033705153852740394,
"grad_norm": 3.972559928894043,
"learning_rate": 6.737235367372354e-07,
"loss": 0.8816,
"step": 2165
},
{
"epoch": 0.03378299485470977,
"grad_norm": 7.038811683654785,
"learning_rate": 6.75280199252802e-07,
"loss": 0.8586,
"step": 2170
},
{
"epoch": 0.03386083585667915,
"grad_norm": 4.659327507019043,
"learning_rate": 6.768368617683686e-07,
"loss": 0.8671,
"step": 2175
},
{
"epoch": 0.03393867685864852,
"grad_norm": 3.272244453430176,
"learning_rate": 6.783935242839352e-07,
"loss": 0.8553,
"step": 2180
},
{
"epoch": 0.0340165178606179,
"grad_norm": 4.486519813537598,
"learning_rate": 6.799501867995019e-07,
"loss": 0.9569,
"step": 2185
},
{
"epoch": 0.03409435886258728,
"grad_norm": 3.5172436237335205,
"learning_rate": 6.815068493150684e-07,
"loss": 0.9172,
"step": 2190
},
{
"epoch": 0.034172199864556656,
"grad_norm": 8.919556617736816,
"learning_rate": 6.830635118306351e-07,
"loss": 0.9005,
"step": 2195
},
{
"epoch": 0.034250040866526034,
"grad_norm": 3.1688411235809326,
"learning_rate": 6.846201743462016e-07,
"loss": 0.9052,
"step": 2200
},
{
"epoch": 0.03432788186849541,
"grad_norm": 8.181324005126953,
"learning_rate": 6.861768368617683e-07,
"loss": 0.9344,
"step": 2205
},
{
"epoch": 0.03440572287046479,
"grad_norm": 14.188647270202637,
"learning_rate": 6.877334993773349e-07,
"loss": 0.8221,
"step": 2210
},
{
"epoch": 0.03448356387243417,
"grad_norm": 2.8779571056365967,
"learning_rate": 6.892901618929016e-07,
"loss": 0.8213,
"step": 2215
},
{
"epoch": 0.034561404874403547,
"grad_norm": 4.762483596801758,
"learning_rate": 6.908468244084682e-07,
"loss": 0.9539,
"step": 2220
},
{
"epoch": 0.03463924587637292,
"grad_norm": 5.372674942016602,
"learning_rate": 6.924034869240348e-07,
"loss": 0.9323,
"step": 2225
},
{
"epoch": 0.034717086878342296,
"grad_norm": 4.73727560043335,
"learning_rate": 6.939601494396014e-07,
"loss": 0.9555,
"step": 2230
},
{
"epoch": 0.034794927880311674,
"grad_norm": 2.479062557220459,
"learning_rate": 6.955168119551681e-07,
"loss": 0.8026,
"step": 2235
},
{
"epoch": 0.03487276888228105,
"grad_norm": 4.98023796081543,
"learning_rate": 6.970734744707347e-07,
"loss": 0.9514,
"step": 2240
},
{
"epoch": 0.03495060988425043,
"grad_norm": 4.072389125823975,
"learning_rate": 6.986301369863014e-07,
"loss": 0.9739,
"step": 2245
},
{
"epoch": 0.03502845088621981,
"grad_norm": 3.26598858833313,
"learning_rate": 7.00186799501868e-07,
"loss": 0.8112,
"step": 2250
},
{
"epoch": 0.03510629188818919,
"grad_norm": 10.324394226074219,
"learning_rate": 7.017434620174346e-07,
"loss": 0.8578,
"step": 2255
},
{
"epoch": 0.035184132890158565,
"grad_norm": 7.579793453216553,
"learning_rate": 7.033001245330012e-07,
"loss": 0.8586,
"step": 2260
},
{
"epoch": 0.035261973892127936,
"grad_norm": 3.6266613006591797,
"learning_rate": 7.048567870485679e-07,
"loss": 0.8904,
"step": 2265
},
{
"epoch": 0.035339814894097314,
"grad_norm": 4.336295127868652,
"learning_rate": 7.064134495641345e-07,
"loss": 0.84,
"step": 2270
},
{
"epoch": 0.03541765589606669,
"grad_norm": 3.5872817039489746,
"learning_rate": 7.079701120797012e-07,
"loss": 0.7951,
"step": 2275
},
{
"epoch": 0.03549549689803607,
"grad_norm": 4.598228454589844,
"learning_rate": 7.095267745952676e-07,
"loss": 0.7467,
"step": 2280
},
{
"epoch": 0.03557333790000545,
"grad_norm": 3.560222625732422,
"learning_rate": 7.110834371108343e-07,
"loss": 0.9047,
"step": 2285
},
{
"epoch": 0.03565117890197483,
"grad_norm": 2.8487563133239746,
"learning_rate": 7.126400996264009e-07,
"loss": 0.9243,
"step": 2290
},
{
"epoch": 0.035729019903944205,
"grad_norm": 5.525490760803223,
"learning_rate": 7.141967621419676e-07,
"loss": 0.8549,
"step": 2295
},
{
"epoch": 0.03580686090591358,
"grad_norm": 3.5428950786590576,
"learning_rate": 7.157534246575342e-07,
"loss": 0.9336,
"step": 2300
},
{
"epoch": 0.03588470190788296,
"grad_norm": 8.396724700927734,
"learning_rate": 7.173100871731008e-07,
"loss": 0.9101,
"step": 2305
},
{
"epoch": 0.03596254290985233,
"grad_norm": 6.355068206787109,
"learning_rate": 7.188667496886674e-07,
"loss": 0.8673,
"step": 2310
},
{
"epoch": 0.03604038391182171,
"grad_norm": 8.388739585876465,
"learning_rate": 7.204234122042341e-07,
"loss": 0.9225,
"step": 2315
},
{
"epoch": 0.03611822491379109,
"grad_norm": 4.088027477264404,
"learning_rate": 7.219800747198007e-07,
"loss": 0.8003,
"step": 2320
},
{
"epoch": 0.03619606591576047,
"grad_norm": 3.6764137744903564,
"learning_rate": 7.235367372353674e-07,
"loss": 0.782,
"step": 2325
},
{
"epoch": 0.036273906917729845,
"grad_norm": 3.6554110050201416,
"learning_rate": 7.25093399750934e-07,
"loss": 0.9257,
"step": 2330
},
{
"epoch": 0.03635174791969922,
"grad_norm": 6.99379301071167,
"learning_rate": 7.266500622665006e-07,
"loss": 0.9329,
"step": 2335
},
{
"epoch": 0.0364295889216686,
"grad_norm": 3.984800100326538,
"learning_rate": 7.282067247820672e-07,
"loss": 0.9596,
"step": 2340
},
{
"epoch": 0.03650742992363798,
"grad_norm": 7.992112159729004,
"learning_rate": 7.297633872976339e-07,
"loss": 0.8945,
"step": 2345
},
{
"epoch": 0.03658527092560736,
"grad_norm": 3.314192295074463,
"learning_rate": 7.313200498132005e-07,
"loss": 0.817,
"step": 2350
},
{
"epoch": 0.03666311192757673,
"grad_norm": 5.738452434539795,
"learning_rate": 7.328767123287672e-07,
"loss": 0.8133,
"step": 2355
},
{
"epoch": 0.03674095292954611,
"grad_norm": 4.364063739776611,
"learning_rate": 7.344333748443337e-07,
"loss": 0.7555,
"step": 2360
},
{
"epoch": 0.036818793931515485,
"grad_norm": 6.397834777832031,
"learning_rate": 7.359900373599004e-07,
"loss": 0.9289,
"step": 2365
},
{
"epoch": 0.03689663493348486,
"grad_norm": 4.602386951446533,
"learning_rate": 7.375466998754669e-07,
"loss": 0.8466,
"step": 2370
},
{
"epoch": 0.03697447593545424,
"grad_norm": 4.438021659851074,
"learning_rate": 7.391033623910336e-07,
"loss": 0.8155,
"step": 2375
},
{
"epoch": 0.03705231693742362,
"grad_norm": 5.829861164093018,
"learning_rate": 7.406600249066002e-07,
"loss": 0.9119,
"step": 2380
},
{
"epoch": 0.037130157939393,
"grad_norm": 3.999397039413452,
"learning_rate": 7.422166874221668e-07,
"loss": 0.9544,
"step": 2385
},
{
"epoch": 0.037207998941362376,
"grad_norm": 7.094069480895996,
"learning_rate": 7.437733499377334e-07,
"loss": 0.8562,
"step": 2390
},
{
"epoch": 0.03728583994333175,
"grad_norm": 7.502668857574463,
"learning_rate": 7.453300124533001e-07,
"loss": 0.7626,
"step": 2395
},
{
"epoch": 0.037363680945301125,
"grad_norm": 4.224865913391113,
"learning_rate": 7.468866749688667e-07,
"loss": 0.8287,
"step": 2400
},
{
"epoch": 0.037441521947270504,
"grad_norm": 4.2678046226501465,
"learning_rate": 7.484433374844334e-07,
"loss": 0.9631,
"step": 2405
},
{
"epoch": 0.03751936294923988,
"grad_norm": 4.143566608428955,
"learning_rate": 7.5e-07,
"loss": 0.9013,
"step": 2410
},
{
"epoch": 0.03759720395120926,
"grad_norm": 3.8706650733947754,
"learning_rate": 7.515566625155666e-07,
"loss": 0.8463,
"step": 2415
},
{
"epoch": 0.03767504495317864,
"grad_norm": 6.372035503387451,
"learning_rate": 7.531133250311332e-07,
"loss": 0.8966,
"step": 2420
},
{
"epoch": 0.037752885955148016,
"grad_norm": 4.3398613929748535,
"learning_rate": 7.546699875466999e-07,
"loss": 0.8045,
"step": 2425
},
{
"epoch": 0.037830726957117394,
"grad_norm": 2.7824904918670654,
"learning_rate": 7.562266500622665e-07,
"loss": 0.8311,
"step": 2430
},
{
"epoch": 0.03790856795908677,
"grad_norm": 3.9570069313049316,
"learning_rate": 7.577833125778332e-07,
"loss": 0.9548,
"step": 2435
},
{
"epoch": 0.037986408961056144,
"grad_norm": 4.316530227661133,
"learning_rate": 7.593399750933997e-07,
"loss": 0.7945,
"step": 2440
},
{
"epoch": 0.03806424996302552,
"grad_norm": 4.4045844078063965,
"learning_rate": 7.608966376089664e-07,
"loss": 0.9145,
"step": 2445
},
{
"epoch": 0.0381420909649949,
"grad_norm": 3.736820697784424,
"learning_rate": 7.62453300124533e-07,
"loss": 0.9292,
"step": 2450
},
{
"epoch": 0.03821993196696428,
"grad_norm": 3.8448410034179688,
"learning_rate": 7.640099626400996e-07,
"loss": 0.8863,
"step": 2455
},
{
"epoch": 0.038297772968933656,
"grad_norm": 7.468678951263428,
"learning_rate": 7.655666251556662e-07,
"loss": 0.8776,
"step": 2460
},
{
"epoch": 0.038375613970903034,
"grad_norm": 4.066128253936768,
"learning_rate": 7.671232876712328e-07,
"loss": 0.906,
"step": 2465
},
{
"epoch": 0.03845345497287241,
"grad_norm": 8.009504318237305,
"learning_rate": 7.686799501867994e-07,
"loss": 0.9295,
"step": 2470
},
{
"epoch": 0.03853129597484179,
"grad_norm": 3.9662601947784424,
"learning_rate": 7.702366127023661e-07,
"loss": 0.927,
"step": 2475
},
{
"epoch": 0.03860913697681117,
"grad_norm": 3.94587779045105,
"learning_rate": 7.717932752179327e-07,
"loss": 0.9679,
"step": 2480
},
{
"epoch": 0.03868697797878054,
"grad_norm": 3.856196641921997,
"learning_rate": 7.733499377334994e-07,
"loss": 0.981,
"step": 2485
},
{
"epoch": 0.03876481898074992,
"grad_norm": 12.542234420776367,
"learning_rate": 7.749066002490659e-07,
"loss": 0.8065,
"step": 2490
},
{
"epoch": 0.038842659982719296,
"grad_norm": 5.717936038970947,
"learning_rate": 7.764632627646326e-07,
"loss": 0.8912,
"step": 2495
},
{
"epoch": 0.038920500984688675,
"grad_norm": 9.94604206085205,
"learning_rate": 7.780199252801992e-07,
"loss": 0.9442,
"step": 2500
},
{
"epoch": 0.03899834198665805,
"grad_norm": 5.26216983795166,
"learning_rate": 7.795765877957659e-07,
"loss": 0.9552,
"step": 2505
},
{
"epoch": 0.03907618298862743,
"grad_norm": 6.468954563140869,
"learning_rate": 7.811332503113325e-07,
"loss": 0.9372,
"step": 2510
},
{
"epoch": 0.03915402399059681,
"grad_norm": 2.9301857948303223,
"learning_rate": 7.826899128268992e-07,
"loss": 0.8479,
"step": 2515
},
{
"epoch": 0.03923186499256619,
"grad_norm": 6.389108657836914,
"learning_rate": 7.842465753424657e-07,
"loss": 0.9255,
"step": 2520
},
{
"epoch": 0.03930970599453556,
"grad_norm": 4.842959880828857,
"learning_rate": 7.858032378580324e-07,
"loss": 0.8478,
"step": 2525
},
{
"epoch": 0.03938754699650494,
"grad_norm": 3.118706464767456,
"learning_rate": 7.87359900373599e-07,
"loss": 0.8668,
"step": 2530
},
{
"epoch": 0.039465387998474315,
"grad_norm": 6.257364273071289,
"learning_rate": 7.889165628891657e-07,
"loss": 0.873,
"step": 2535
},
{
"epoch": 0.03954322900044369,
"grad_norm": 4.405180931091309,
"learning_rate": 7.904732254047323e-07,
"loss": 0.841,
"step": 2540
},
{
"epoch": 0.03962107000241307,
"grad_norm": 9.870434761047363,
"learning_rate": 7.920298879202987e-07,
"loss": 0.937,
"step": 2545
},
{
"epoch": 0.03969891100438245,
"grad_norm": 3.4615135192871094,
"learning_rate": 7.935865504358654e-07,
"loss": 0.9465,
"step": 2550
},
{
"epoch": 0.03977675200635183,
"grad_norm": 4.9833760261535645,
"learning_rate": 7.95143212951432e-07,
"loss": 0.8261,
"step": 2555
},
{
"epoch": 0.039854593008321205,
"grad_norm": 4.042236804962158,
"learning_rate": 7.966998754669987e-07,
"loss": 0.9602,
"step": 2560
},
{
"epoch": 0.039932434010290584,
"grad_norm": 4.549630641937256,
"learning_rate": 7.982565379825654e-07,
"loss": 0.8674,
"step": 2565
},
{
"epoch": 0.040010275012259955,
"grad_norm": 3.67543363571167,
"learning_rate": 7.998132004981319e-07,
"loss": 0.9073,
"step": 2570
},
{
"epoch": 0.04008811601422933,
"grad_norm": 6.078221321105957,
"learning_rate": 8.013698630136985e-07,
"loss": 0.9359,
"step": 2575
},
{
"epoch": 0.04016595701619871,
"grad_norm": 5.599534034729004,
"learning_rate": 8.029265255292652e-07,
"loss": 0.8628,
"step": 2580
},
{
"epoch": 0.04024379801816809,
"grad_norm": 5.098958492279053,
"learning_rate": 8.044831880448319e-07,
"loss": 0.9412,
"step": 2585
},
{
"epoch": 0.04032163902013747,
"grad_norm": 7.108897686004639,
"learning_rate": 8.060398505603985e-07,
"loss": 0.9517,
"step": 2590
},
{
"epoch": 0.040399480022106846,
"grad_norm": 4.495419979095459,
"learning_rate": 8.075965130759652e-07,
"loss": 0.8605,
"step": 2595
},
{
"epoch": 0.040477321024076224,
"grad_norm": 4.583033084869385,
"learning_rate": 8.091531755915317e-07,
"loss": 0.8956,
"step": 2600
},
{
"epoch": 0.0405551620260456,
"grad_norm": 5.067065238952637,
"learning_rate": 8.107098381070983e-07,
"loss": 0.8306,
"step": 2605
},
{
"epoch": 0.04063300302801497,
"grad_norm": 7.724658012390137,
"learning_rate": 8.12266500622665e-07,
"loss": 0.8716,
"step": 2610
},
{
"epoch": 0.04071084402998435,
"grad_norm": 2.8972911834716797,
"learning_rate": 8.138231631382317e-07,
"loss": 0.8987,
"step": 2615
},
{
"epoch": 0.04078868503195373,
"grad_norm": 7.840747833251953,
"learning_rate": 8.153798256537983e-07,
"loss": 0.8418,
"step": 2620
},
{
"epoch": 0.04086652603392311,
"grad_norm": 7.727685928344727,
"learning_rate": 8.169364881693648e-07,
"loss": 0.9107,
"step": 2625
},
{
"epoch": 0.040944367035892486,
"grad_norm": 3.801807165145874,
"learning_rate": 8.184931506849315e-07,
"loss": 0.8083,
"step": 2630
},
{
"epoch": 0.041022208037861864,
"grad_norm": 12.985006332397461,
"learning_rate": 8.20049813200498e-07,
"loss": 0.9866,
"step": 2635
},
{
"epoch": 0.04110004903983124,
"grad_norm": 3.2062785625457764,
"learning_rate": 8.216064757160647e-07,
"loss": 0.8945,
"step": 2640
},
{
"epoch": 0.04117789004180062,
"grad_norm": 4.6915459632873535,
"learning_rate": 8.231631382316313e-07,
"loss": 0.8236,
"step": 2645
},
{
"epoch": 0.04125573104377,
"grad_norm": 3.5803701877593994,
"learning_rate": 8.247198007471979e-07,
"loss": 0.8776,
"step": 2650
},
{
"epoch": 0.04133357204573937,
"grad_norm": 12.053580284118652,
"learning_rate": 8.262764632627645e-07,
"loss": 0.9105,
"step": 2655
},
{
"epoch": 0.04141141304770875,
"grad_norm": 6.285280227661133,
"learning_rate": 8.278331257783312e-07,
"loss": 0.9066,
"step": 2660
},
{
"epoch": 0.041489254049678126,
"grad_norm": 5.232326984405518,
"learning_rate": 8.293897882938978e-07,
"loss": 0.8508,
"step": 2665
},
{
"epoch": 0.041567095051647504,
"grad_norm": 3.038318395614624,
"learning_rate": 8.309464508094645e-07,
"loss": 0.8356,
"step": 2670
},
{
"epoch": 0.04164493605361688,
"grad_norm": 7.6262335777282715,
"learning_rate": 8.32503113325031e-07,
"loss": 0.9568,
"step": 2675
},
{
"epoch": 0.04172277705558626,
"grad_norm": 3.0321080684661865,
"learning_rate": 8.340597758405977e-07,
"loss": 0.7881,
"step": 2680
},
{
"epoch": 0.04180061805755564,
"grad_norm": 9.739387512207031,
"learning_rate": 8.356164383561643e-07,
"loss": 0.74,
"step": 2685
},
{
"epoch": 0.04187845905952502,
"grad_norm": 8.000276565551758,
"learning_rate": 8.37173100871731e-07,
"loss": 0.9413,
"step": 2690
},
{
"epoch": 0.041956300061494395,
"grad_norm": 6.706925868988037,
"learning_rate": 8.387297633872976e-07,
"loss": 0.7788,
"step": 2695
},
{
"epoch": 0.042034141063463766,
"grad_norm": 6.568419933319092,
"learning_rate": 8.402864259028643e-07,
"loss": 0.8554,
"step": 2700
},
{
"epoch": 0.042111982065433144,
"grad_norm": 3.8879165649414062,
"learning_rate": 8.418430884184308e-07,
"loss": 0.9076,
"step": 2705
},
{
"epoch": 0.04218982306740252,
"grad_norm": 5.89036226272583,
"learning_rate": 8.433997509339975e-07,
"loss": 0.91,
"step": 2710
},
{
"epoch": 0.0422676640693719,
"grad_norm": 5.522625923156738,
"learning_rate": 8.449564134495641e-07,
"loss": 0.9102,
"step": 2715
},
{
"epoch": 0.04234550507134128,
"grad_norm": 4.862393379211426,
"learning_rate": 8.465130759651308e-07,
"loss": 0.8503,
"step": 2720
},
{
"epoch": 0.04242334607331066,
"grad_norm": 8.545342445373535,
"learning_rate": 8.480697384806973e-07,
"loss": 0.7882,
"step": 2725
},
{
"epoch": 0.042501187075280035,
"grad_norm": 3.1325466632843018,
"learning_rate": 8.496264009962639e-07,
"loss": 0.7993,
"step": 2730
},
{
"epoch": 0.04257902807724941,
"grad_norm": 3.6244635581970215,
"learning_rate": 8.511830635118305e-07,
"loss": 0.7498,
"step": 2735
},
{
"epoch": 0.042656869079218784,
"grad_norm": 7.154248237609863,
"learning_rate": 8.527397260273972e-07,
"loss": 0.856,
"step": 2740
},
{
"epoch": 0.04273471008118816,
"grad_norm": 4.3253960609436035,
"learning_rate": 8.542963885429638e-07,
"loss": 1.0214,
"step": 2745
},
{
"epoch": 0.04281255108315754,
"grad_norm": 4.56231164932251,
"learning_rate": 8.558530510585305e-07,
"loss": 0.8501,
"step": 2750
},
{
"epoch": 0.04289039208512692,
"grad_norm": 3.396204710006714,
"learning_rate": 8.57409713574097e-07,
"loss": 0.9002,
"step": 2755
},
{
"epoch": 0.0429682330870963,
"grad_norm": 5.2896952629089355,
"learning_rate": 8.589663760896637e-07,
"loss": 0.8824,
"step": 2760
},
{
"epoch": 0.043046074089065675,
"grad_norm": 2.9441330432891846,
"learning_rate": 8.605230386052303e-07,
"loss": 0.915,
"step": 2765
},
{
"epoch": 0.04312391509103505,
"grad_norm": 3.7935092449188232,
"learning_rate": 8.62079701120797e-07,
"loss": 0.9273,
"step": 2770
},
{
"epoch": 0.04320175609300443,
"grad_norm": 2.87821102142334,
"learning_rate": 8.636363636363636e-07,
"loss": 0.7991,
"step": 2775
},
{
"epoch": 0.04327959709497381,
"grad_norm": 6.359185218811035,
"learning_rate": 8.651930261519303e-07,
"loss": 0.8009,
"step": 2780
},
{
"epoch": 0.04335743809694318,
"grad_norm": 4.339592456817627,
"learning_rate": 8.667496886674968e-07,
"loss": 0.9357,
"step": 2785
},
{
"epoch": 0.04343527909891256,
"grad_norm": 5.373045921325684,
"learning_rate": 8.683063511830635e-07,
"loss": 0.9278,
"step": 2790
},
{
"epoch": 0.04351312010088194,
"grad_norm": 4.687058448791504,
"learning_rate": 8.698630136986301e-07,
"loss": 0.8102,
"step": 2795
},
{
"epoch": 0.043590961102851315,
"grad_norm": 3.0270180702209473,
"learning_rate": 8.714196762141968e-07,
"loss": 0.7115,
"step": 2800
},
{
"epoch": 0.04366880210482069,
"grad_norm": 4.379403114318848,
"learning_rate": 8.729763387297634e-07,
"loss": 0.9258,
"step": 2805
},
{
"epoch": 0.04374664310679007,
"grad_norm": 5.339996814727783,
"learning_rate": 8.7453300124533e-07,
"loss": 0.8254,
"step": 2810
},
{
"epoch": 0.04382448410875945,
"grad_norm": 8.269269943237305,
"learning_rate": 8.760896637608965e-07,
"loss": 0.8743,
"step": 2815
},
{
"epoch": 0.04390232511072883,
"grad_norm": 3.314060688018799,
"learning_rate": 8.776463262764632e-07,
"loss": 0.8462,
"step": 2820
},
{
"epoch": 0.043980166112698206,
"grad_norm": 5.628077507019043,
"learning_rate": 8.792029887920298e-07,
"loss": 0.8659,
"step": 2825
},
{
"epoch": 0.04405800711466758,
"grad_norm": 5.050654888153076,
"learning_rate": 8.807596513075965e-07,
"loss": 0.9523,
"step": 2830
},
{
"epoch": 0.044135848116636955,
"grad_norm": 3.8180229663848877,
"learning_rate": 8.82316313823163e-07,
"loss": 0.8504,
"step": 2835
},
{
"epoch": 0.044213689118606334,
"grad_norm": 3.3269238471984863,
"learning_rate": 8.838729763387297e-07,
"loss": 0.8544,
"step": 2840
},
{
"epoch": 0.04429153012057571,
"grad_norm": 5.524733066558838,
"learning_rate": 8.854296388542963e-07,
"loss": 0.8963,
"step": 2845
},
{
"epoch": 0.04436937112254509,
"grad_norm": 4.741829872131348,
"learning_rate": 8.86986301369863e-07,
"loss": 0.8262,
"step": 2850
},
{
"epoch": 0.04444721212451447,
"grad_norm": 4.213449001312256,
"learning_rate": 8.885429638854296e-07,
"loss": 0.8722,
"step": 2855
},
{
"epoch": 0.044525053126483846,
"grad_norm": 3.3883323669433594,
"learning_rate": 8.900996264009963e-07,
"loss": 0.8378,
"step": 2860
},
{
"epoch": 0.044602894128453224,
"grad_norm": 5.192069053649902,
"learning_rate": 8.916562889165628e-07,
"loss": 0.8204,
"step": 2865
},
{
"epoch": 0.044680735130422596,
"grad_norm": 3.5852484703063965,
"learning_rate": 8.932129514321295e-07,
"loss": 0.9008,
"step": 2870
},
{
"epoch": 0.044758576132391974,
"grad_norm": 5.334090709686279,
"learning_rate": 8.947696139476961e-07,
"loss": 0.7885,
"step": 2875
},
{
"epoch": 0.04483641713436135,
"grad_norm": 5.502117156982422,
"learning_rate": 8.963262764632628e-07,
"loss": 0.8715,
"step": 2880
},
{
"epoch": 0.04491425813633073,
"grad_norm": 3.577226400375366,
"learning_rate": 8.978829389788294e-07,
"loss": 0.8076,
"step": 2885
},
{
"epoch": 0.04499209913830011,
"grad_norm": 2.6925950050354004,
"learning_rate": 8.99439601494396e-07,
"loss": 0.8624,
"step": 2890
},
{
"epoch": 0.045069940140269486,
"grad_norm": 15.992047309875488,
"learning_rate": 9.009962640099626e-07,
"loss": 0.8628,
"step": 2895
},
{
"epoch": 0.045147781142238864,
"grad_norm": 4.682984352111816,
"learning_rate": 9.025529265255293e-07,
"loss": 0.876,
"step": 2900
},
{
"epoch": 0.04522562214420824,
"grad_norm": 3.694166421890259,
"learning_rate": 9.041095890410958e-07,
"loss": 0.7707,
"step": 2905
},
{
"epoch": 0.04530346314617762,
"grad_norm": 6.382852077484131,
"learning_rate": 9.056662515566625e-07,
"loss": 0.8372,
"step": 2910
},
{
"epoch": 0.04538130414814699,
"grad_norm": 21.609174728393555,
"learning_rate": 9.07222914072229e-07,
"loss": 0.9099,
"step": 2915
},
{
"epoch": 0.04545914515011637,
"grad_norm": 2.4359121322631836,
"learning_rate": 9.087795765877957e-07,
"loss": 0.9096,
"step": 2920
},
{
"epoch": 0.04553698615208575,
"grad_norm": 3.6651458740234375,
"learning_rate": 9.103362391033623e-07,
"loss": 0.9408,
"step": 2925
},
{
"epoch": 0.045614827154055126,
"grad_norm": 5.385332107543945,
"learning_rate": 9.11892901618929e-07,
"loss": 0.9312,
"step": 2930
},
{
"epoch": 0.045692668156024505,
"grad_norm": 3.9548020362854004,
"learning_rate": 9.134495641344956e-07,
"loss": 0.8158,
"step": 2935
},
{
"epoch": 0.04577050915799388,
"grad_norm": 2.7402524948120117,
"learning_rate": 9.150062266500622e-07,
"loss": 0.7451,
"step": 2940
},
{
"epoch": 0.04584835015996326,
"grad_norm": 6.259926795959473,
"learning_rate": 9.165628891656288e-07,
"loss": 0.7399,
"step": 2945
},
{
"epoch": 0.04592619116193264,
"grad_norm": 2.590571403503418,
"learning_rate": 9.181195516811955e-07,
"loss": 0.9088,
"step": 2950
},
{
"epoch": 0.04600403216390201,
"grad_norm": 4.331900596618652,
"learning_rate": 9.196762141967621e-07,
"loss": 0.8636,
"step": 2955
},
{
"epoch": 0.04608187316587139,
"grad_norm": 5.567667007446289,
"learning_rate": 9.212328767123288e-07,
"loss": 0.9059,
"step": 2960
},
{
"epoch": 0.04615971416784077,
"grad_norm": 4.826610565185547,
"learning_rate": 9.227895392278954e-07,
"loss": 0.7942,
"step": 2965
},
{
"epoch": 0.046237555169810145,
"grad_norm": 7.561775207519531,
"learning_rate": 9.24346201743462e-07,
"loss": 0.8461,
"step": 2970
},
{
"epoch": 0.04631539617177952,
"grad_norm": 4.251841068267822,
"learning_rate": 9.259028642590286e-07,
"loss": 0.8541,
"step": 2975
},
{
"epoch": 0.0463932371737489,
"grad_norm": 5.157746315002441,
"learning_rate": 9.274595267745953e-07,
"loss": 0.8863,
"step": 2980
},
{
"epoch": 0.04647107817571828,
"grad_norm": 4.906675815582275,
"learning_rate": 9.290161892901619e-07,
"loss": 0.8432,
"step": 2985
},
{
"epoch": 0.04654891917768766,
"grad_norm": 4.339780807495117,
"learning_rate": 9.305728518057285e-07,
"loss": 0.8448,
"step": 2990
},
{
"epoch": 0.046626760179657036,
"grad_norm": 7.889379024505615,
"learning_rate": 9.32129514321295e-07,
"loss": 0.9431,
"step": 2995
},
{
"epoch": 0.04670460118162641,
"grad_norm": 3.7697620391845703,
"learning_rate": 9.336861768368617e-07,
"loss": 0.8569,
"step": 3000
},
{
"epoch": 0.046782442183595785,
"grad_norm": 8.277153968811035,
"learning_rate": 9.352428393524283e-07,
"loss": 0.7687,
"step": 3005
},
{
"epoch": 0.04686028318556516,
"grad_norm": 6.6820149421691895,
"learning_rate": 9.36799501867995e-07,
"loss": 0.8283,
"step": 3010
},
{
"epoch": 0.04693812418753454,
"grad_norm": 6.457581996917725,
"learning_rate": 9.383561643835616e-07,
"loss": 0.8447,
"step": 3015
},
{
"epoch": 0.04701596518950392,
"grad_norm": 8.55042552947998,
"learning_rate": 9.399128268991282e-07,
"loss": 0.8986,
"step": 3020
},
{
"epoch": 0.0470938061914733,
"grad_norm": 8.297921180725098,
"learning_rate": 9.414694894146948e-07,
"loss": 0.9012,
"step": 3025
},
{
"epoch": 0.047171647193442676,
"grad_norm": 7.09883975982666,
"learning_rate": 9.430261519302615e-07,
"loss": 0.7454,
"step": 3030
},
{
"epoch": 0.047249488195412054,
"grad_norm": 8.166378021240234,
"learning_rate": 9.445828144458281e-07,
"loss": 0.8405,
"step": 3035
},
{
"epoch": 0.04732732919738143,
"grad_norm": 4.509795665740967,
"learning_rate": 9.461394769613948e-07,
"loss": 0.8076,
"step": 3040
},
{
"epoch": 0.0474051701993508,
"grad_norm": 8.811022758483887,
"learning_rate": 9.476961394769614e-07,
"loss": 0.8387,
"step": 3045
},
{
"epoch": 0.04748301120132018,
"grad_norm": 3.122080087661743,
"learning_rate": 9.49252801992528e-07,
"loss": 0.7069,
"step": 3050
},
{
"epoch": 0.04756085220328956,
"grad_norm": 6.84942626953125,
"learning_rate": 9.508094645080946e-07,
"loss": 0.7412,
"step": 3055
},
{
"epoch": 0.04763869320525894,
"grad_norm": 7.2728424072265625,
"learning_rate": 9.523661270236613e-07,
"loss": 0.8156,
"step": 3060
},
{
"epoch": 0.047716534207228316,
"grad_norm": 4.008538246154785,
"learning_rate": 9.539227895392278e-07,
"loss": 0.8618,
"step": 3065
},
{
"epoch": 0.047794375209197694,
"grad_norm": 3.9167230129241943,
"learning_rate": 9.554794520547946e-07,
"loss": 0.8026,
"step": 3070
},
{
"epoch": 0.04787221621116707,
"grad_norm": 3.683629274368286,
"learning_rate": 9.570361145703611e-07,
"loss": 0.9918,
"step": 3075
},
{
"epoch": 0.04795005721313645,
"grad_norm": 4.745260238647461,
"learning_rate": 9.585927770859277e-07,
"loss": 0.8148,
"step": 3080
},
{
"epoch": 0.04802789821510582,
"grad_norm": 2.839996814727783,
"learning_rate": 9.601494396014944e-07,
"loss": 0.8176,
"step": 3085
},
{
"epoch": 0.0481057392170752,
"grad_norm": 5.715896129608154,
"learning_rate": 9.61706102117061e-07,
"loss": 0.9266,
"step": 3090
},
{
"epoch": 0.04818358021904458,
"grad_norm": 3.44376540184021,
"learning_rate": 9.632627646326275e-07,
"loss": 0.7851,
"step": 3095
},
{
"epoch": 0.048261421221013956,
"grad_norm": 4.266874313354492,
"learning_rate": 9.648194271481943e-07,
"loss": 0.8701,
"step": 3100
},
{
"epoch": 0.048339262222983334,
"grad_norm": 3.6180949211120605,
"learning_rate": 9.663760896637608e-07,
"loss": 0.9206,
"step": 3105
},
{
"epoch": 0.04841710322495271,
"grad_norm": 4.103425979614258,
"learning_rate": 9.679327521793276e-07,
"loss": 0.9025,
"step": 3110
},
{
"epoch": 0.04849494422692209,
"grad_norm": 3.339601516723633,
"learning_rate": 9.69489414694894e-07,
"loss": 0.7176,
"step": 3115
},
{
"epoch": 0.04857278522889147,
"grad_norm": 5.723580360412598,
"learning_rate": 9.710460772104606e-07,
"loss": 0.7712,
"step": 3120
},
{
"epoch": 0.04865062623086085,
"grad_norm": 9.84174919128418,
"learning_rate": 9.726027397260274e-07,
"loss": 0.8301,
"step": 3125
},
{
"epoch": 0.04872846723283022,
"grad_norm": 4.004911422729492,
"learning_rate": 9.74159402241594e-07,
"loss": 0.8232,
"step": 3130
},
{
"epoch": 0.048806308234799596,
"grad_norm": 4.22821044921875,
"learning_rate": 9.757160647571607e-07,
"loss": 0.8408,
"step": 3135
},
{
"epoch": 0.048884149236768974,
"grad_norm": 3.268477439880371,
"learning_rate": 9.772727272727273e-07,
"loss": 0.854,
"step": 3140
},
{
"epoch": 0.04896199023873835,
"grad_norm": 3.3312723636627197,
"learning_rate": 9.788293897882938e-07,
"loss": 0.7584,
"step": 3145
},
{
"epoch": 0.04903983124070773,
"grad_norm": 2.6721420288085938,
"learning_rate": 9.803860523038606e-07,
"loss": 0.7101,
"step": 3150
},
{
"epoch": 0.04911767224267711,
"grad_norm": 3.1036221981048584,
"learning_rate": 9.81942714819427e-07,
"loss": 0.8943,
"step": 3155
},
{
"epoch": 0.04919551324464649,
"grad_norm": 4.581750869750977,
"learning_rate": 9.834993773349939e-07,
"loss": 0.956,
"step": 3160
},
{
"epoch": 0.049273354246615865,
"grad_norm": 5.273120880126953,
"learning_rate": 9.850560398505604e-07,
"loss": 0.8077,
"step": 3165
},
{
"epoch": 0.04935119524858524,
"grad_norm": 7.310013771057129,
"learning_rate": 9.86612702366127e-07,
"loss": 0.8506,
"step": 3170
},
{
"epoch": 0.049429036250554614,
"grad_norm": 5.949068069458008,
"learning_rate": 9.881693648816935e-07,
"loss": 0.8132,
"step": 3175
},
{
"epoch": 0.04950687725252399,
"grad_norm": 5.228186130523682,
"learning_rate": 9.897260273972602e-07,
"loss": 0.9303,
"step": 3180
},
{
"epoch": 0.04958471825449337,
"grad_norm": 4.407035827636719,
"learning_rate": 9.912826899128268e-07,
"loss": 0.8184,
"step": 3185
},
{
"epoch": 0.04966255925646275,
"grad_norm": 4.605864524841309,
"learning_rate": 9.928393524283936e-07,
"loss": 0.9336,
"step": 3190
},
{
"epoch": 0.04974040025843213,
"grad_norm": 3.0708847045898438,
"learning_rate": 9.9439601494396e-07,
"loss": 0.8725,
"step": 3195
},
{
"epoch": 0.049818241260401505,
"grad_norm": 3.3742926120758057,
"learning_rate": 9.959526774595266e-07,
"loss": 0.8121,
"step": 3200
},
{
"epoch": 0.04989608226237088,
"grad_norm": 2.685382843017578,
"learning_rate": 9.975093399750934e-07,
"loss": 0.7798,
"step": 3205
},
{
"epoch": 0.04997392326434026,
"grad_norm": 4.932633876800537,
"learning_rate": 9.9906600249066e-07,
"loss": 0.8492,
"step": 3210
},
{
"epoch": 0.05005176426630963,
"grad_norm": 8.489307403564453,
"learning_rate": 9.999672243981579e-07,
"loss": 0.8355,
"step": 3215
},
{
"epoch": 0.05012960526827901,
"grad_norm": 4.679005146026611,
"learning_rate": 9.99885285393553e-07,
"loss": 0.9012,
"step": 3220
},
{
"epoch": 0.05020744627024839,
"grad_norm": 6.65717887878418,
"learning_rate": 9.99803346388948e-07,
"loss": 1.0277,
"step": 3225
},
{
"epoch": 0.05028528727221777,
"grad_norm": 5.373363494873047,
"learning_rate": 9.99721407384343e-07,
"loss": 1.0007,
"step": 3230
},
{
"epoch": 0.050363128274187145,
"grad_norm": 3.9103312492370605,
"learning_rate": 9.996394683797382e-07,
"loss": 0.8015,
"step": 3235
},
{
"epoch": 0.05044096927615652,
"grad_norm": 6.019688606262207,
"learning_rate": 9.995575293751332e-07,
"loss": 0.8575,
"step": 3240
},
{
"epoch": 0.0505188102781259,
"grad_norm": 17.253416061401367,
"learning_rate": 9.99475590370528e-07,
"loss": 0.7818,
"step": 3245
},
{
"epoch": 0.05059665128009528,
"grad_norm": 9.291438102722168,
"learning_rate": 9.993936513659232e-07,
"loss": 0.9093,
"step": 3250
},
{
"epoch": 0.05067449228206466,
"grad_norm": 4.7031121253967285,
"learning_rate": 9.993117123613182e-07,
"loss": 0.792,
"step": 3255
},
{
"epoch": 0.05075233328403403,
"grad_norm": 3.9141600131988525,
"learning_rate": 9.992297733567131e-07,
"loss": 0.8803,
"step": 3260
},
{
"epoch": 0.05083017428600341,
"grad_norm": 5.731180191040039,
"learning_rate": 9.991478343521082e-07,
"loss": 0.9267,
"step": 3265
},
{
"epoch": 0.050908015287972785,
"grad_norm": 4.963929653167725,
"learning_rate": 9.990658953475033e-07,
"loss": 0.8762,
"step": 3270
},
{
"epoch": 0.050985856289942164,
"grad_norm": 5.126701831817627,
"learning_rate": 9.989839563428983e-07,
"loss": 0.9762,
"step": 3275
},
{
"epoch": 0.05106369729191154,
"grad_norm": 5.1071953773498535,
"learning_rate": 9.989020173382934e-07,
"loss": 0.9504,
"step": 3280
},
{
"epoch": 0.05114153829388092,
"grad_norm": 4.061114311218262,
"learning_rate": 9.988200783336883e-07,
"loss": 0.821,
"step": 3285
},
{
"epoch": 0.0512193792958503,
"grad_norm": 3.604483127593994,
"learning_rate": 9.987381393290833e-07,
"loss": 0.9565,
"step": 3290
},
{
"epoch": 0.051297220297819676,
"grad_norm": 4.070693016052246,
"learning_rate": 9.986562003244784e-07,
"loss": 0.7469,
"step": 3295
},
{
"epoch": 0.051375061299789054,
"grad_norm": 3.4125092029571533,
"learning_rate": 9.985742613198735e-07,
"loss": 0.7926,
"step": 3300
},
{
"epoch": 0.051452902301758426,
"grad_norm": 7.950231075286865,
"learning_rate": 9.984923223152686e-07,
"loss": 0.8422,
"step": 3305
},
{
"epoch": 0.051530743303727804,
"grad_norm": 3.185955762863159,
"learning_rate": 9.984103833106634e-07,
"loss": 0.7959,
"step": 3310
},
{
"epoch": 0.05160858430569718,
"grad_norm": 4.626750946044922,
"learning_rate": 9.983284443060585e-07,
"loss": 0.9932,
"step": 3315
},
{
"epoch": 0.05168642530766656,
"grad_norm": 2.5758249759674072,
"learning_rate": 9.982465053014536e-07,
"loss": 0.7739,
"step": 3320
},
{
"epoch": 0.05176426630963594,
"grad_norm": 3.6274349689483643,
"learning_rate": 9.981645662968484e-07,
"loss": 0.8351,
"step": 3325
},
{
"epoch": 0.051842107311605316,
"grad_norm": 3.520857572555542,
"learning_rate": 9.980826272922435e-07,
"loss": 0.8815,
"step": 3330
},
{
"epoch": 0.051919948313574694,
"grad_norm": 4.665640354156494,
"learning_rate": 9.980006882876386e-07,
"loss": 0.8575,
"step": 3335
},
{
"epoch": 0.05199778931554407,
"grad_norm": 5.597052574157715,
"learning_rate": 9.979187492830337e-07,
"loss": 0.8373,
"step": 3340
},
{
"epoch": 0.052075630317513444,
"grad_norm": 5.660586357116699,
"learning_rate": 9.978368102784287e-07,
"loss": 0.9164,
"step": 3345
},
{
"epoch": 0.05215347131948282,
"grad_norm": 11.376925468444824,
"learning_rate": 9.977548712738238e-07,
"loss": 0.8779,
"step": 3350
},
{
"epoch": 0.0522313123214522,
"grad_norm": 3.930678606033325,
"learning_rate": 9.976729322692187e-07,
"loss": 0.8638,
"step": 3355
},
{
"epoch": 0.05230915332342158,
"grad_norm": 4.059145450592041,
"learning_rate": 9.975909932646138e-07,
"loss": 0.7965,
"step": 3360
},
{
"epoch": 0.052386994325390956,
"grad_norm": 4.585720539093018,
"learning_rate": 9.975090542600088e-07,
"loss": 0.8034,
"step": 3365
},
{
"epoch": 0.052464835327360335,
"grad_norm": 5.015563488006592,
"learning_rate": 9.974271152554037e-07,
"loss": 0.8109,
"step": 3370
},
{
"epoch": 0.05254267632932971,
"grad_norm": 3.2969090938568115,
"learning_rate": 9.973451762507988e-07,
"loss": 0.9502,
"step": 3375
},
{
"epoch": 0.05262051733129909,
"grad_norm": 3.2702388763427734,
"learning_rate": 9.972632372461938e-07,
"loss": 0.8148,
"step": 3380
},
{
"epoch": 0.05269835833326847,
"grad_norm": 2.95889949798584,
"learning_rate": 9.97181298241589e-07,
"loss": 0.8935,
"step": 3385
},
{
"epoch": 0.05277619933523784,
"grad_norm": 5.157326698303223,
"learning_rate": 9.97099359236984e-07,
"loss": 0.9001,
"step": 3390
},
{
"epoch": 0.05285404033720722,
"grad_norm": 3.6577107906341553,
"learning_rate": 9.97017420232379e-07,
"loss": 0.7983,
"step": 3395
},
{
"epoch": 0.0529318813391766,
"grad_norm": 2.539867401123047,
"learning_rate": 9.969354812277741e-07,
"loss": 0.732,
"step": 3400
},
{
"epoch": 0.053009722341145975,
"grad_norm": 6.6847076416015625,
"learning_rate": 9.96853542223169e-07,
"loss": 0.8909,
"step": 3405
},
{
"epoch": 0.05308756334311535,
"grad_norm": 3.6293387413024902,
"learning_rate": 9.96771603218564e-07,
"loss": 0.7757,
"step": 3410
},
{
"epoch": 0.05316540434508473,
"grad_norm": 9.500846862792969,
"learning_rate": 9.966896642139592e-07,
"loss": 0.8709,
"step": 3415
},
{
"epoch": 0.05324324534705411,
"grad_norm": 8.317655563354492,
"learning_rate": 9.96607725209354e-07,
"loss": 0.8833,
"step": 3420
},
{
"epoch": 0.05332108634902349,
"grad_norm": 6.386698246002197,
"learning_rate": 9.96525786204749e-07,
"loss": 0.9136,
"step": 3425
},
{
"epoch": 0.05339892735099286,
"grad_norm": 3.567600965499878,
"learning_rate": 9.964438472001442e-07,
"loss": 0.8465,
"step": 3430
},
{
"epoch": 0.05347676835296224,
"grad_norm": 7.062701225280762,
"learning_rate": 9.963619081955392e-07,
"loss": 0.8179,
"step": 3435
},
{
"epoch": 0.053554609354931615,
"grad_norm": 3.983492851257324,
"learning_rate": 9.962799691909343e-07,
"loss": 0.899,
"step": 3440
},
{
"epoch": 0.05363245035690099,
"grad_norm": 7.150521278381348,
"learning_rate": 9.961980301863292e-07,
"loss": 0.7949,
"step": 3445
},
{
"epoch": 0.05371029135887037,
"grad_norm": 5.3643107414245605,
"learning_rate": 9.961160911817243e-07,
"loss": 0.893,
"step": 3450
},
{
"epoch": 0.05378813236083975,
"grad_norm": 7.8569440841674805,
"learning_rate": 9.960341521771193e-07,
"loss": 0.7597,
"step": 3455
},
{
"epoch": 0.05386597336280913,
"grad_norm": 2.990384817123413,
"learning_rate": 9.959522131725144e-07,
"loss": 0.8968,
"step": 3460
},
{
"epoch": 0.053943814364778506,
"grad_norm": 11.023333549499512,
"learning_rate": 9.958702741679093e-07,
"loss": 0.8577,
"step": 3465
},
{
"epoch": 0.054021655366747884,
"grad_norm": 3.8599610328674316,
"learning_rate": 9.957883351633043e-07,
"loss": 0.8187,
"step": 3470
},
{
"epoch": 0.054099496368717255,
"grad_norm": 4.514223575592041,
"learning_rate": 9.957063961586994e-07,
"loss": 0.8948,
"step": 3475
},
{
"epoch": 0.05417733737068663,
"grad_norm": 5.561735153198242,
"learning_rate": 9.956244571540945e-07,
"loss": 0.7144,
"step": 3480
},
{
"epoch": 0.05425517837265601,
"grad_norm": 2.5921874046325684,
"learning_rate": 9.955425181494896e-07,
"loss": 0.8599,
"step": 3485
},
{
"epoch": 0.05433301937462539,
"grad_norm": 4.871161937713623,
"learning_rate": 9.954605791448844e-07,
"loss": 0.9644,
"step": 3490
},
{
"epoch": 0.05441086037659477,
"grad_norm": 6.471960544586182,
"learning_rate": 9.953786401402795e-07,
"loss": 0.764,
"step": 3495
},
{
"epoch": 0.054488701378564146,
"grad_norm": 5.133829593658447,
"learning_rate": 9.952967011356746e-07,
"loss": 0.8484,
"step": 3500
},
{
"epoch": 0.054566542380533524,
"grad_norm": 15.294747352600098,
"learning_rate": 9.952147621310697e-07,
"loss": 0.9278,
"step": 3505
},
{
"epoch": 0.0546443833825029,
"grad_norm": 4.0458526611328125,
"learning_rate": 9.951328231264645e-07,
"loss": 0.8015,
"step": 3510
},
{
"epoch": 0.05472222438447228,
"grad_norm": 3.96840238571167,
"learning_rate": 9.950508841218596e-07,
"loss": 0.9182,
"step": 3515
},
{
"epoch": 0.05480006538644165,
"grad_norm": 3.493230104446411,
"learning_rate": 9.949689451172547e-07,
"loss": 0.7351,
"step": 3520
},
{
"epoch": 0.05487790638841103,
"grad_norm": 6.453081130981445,
"learning_rate": 9.948870061126497e-07,
"loss": 0.6706,
"step": 3525
},
{
"epoch": 0.05495574739038041,
"grad_norm": 4.883228302001953,
"learning_rate": 9.948050671080446e-07,
"loss": 0.8926,
"step": 3530
},
{
"epoch": 0.055033588392349786,
"grad_norm": 8.88487434387207,
"learning_rate": 9.947231281034397e-07,
"loss": 0.7815,
"step": 3535
},
{
"epoch": 0.055111429394319164,
"grad_norm": 3.5414915084838867,
"learning_rate": 9.946411890988348e-07,
"loss": 0.8066,
"step": 3540
},
{
"epoch": 0.05518927039628854,
"grad_norm": 3.3924477100372314,
"learning_rate": 9.945592500942298e-07,
"loss": 0.8942,
"step": 3545
},
{
"epoch": 0.05526711139825792,
"grad_norm": 8.606155395507812,
"learning_rate": 9.94477311089625e-07,
"loss": 0.8062,
"step": 3550
},
{
"epoch": 0.0553449524002273,
"grad_norm": 3.5798611640930176,
"learning_rate": 9.9439537208502e-07,
"loss": 0.8145,
"step": 3555
},
{
"epoch": 0.05542279340219667,
"grad_norm": 4.816424369812012,
"learning_rate": 9.943134330804148e-07,
"loss": 0.9767,
"step": 3560
},
{
"epoch": 0.05550063440416605,
"grad_norm": 3.161212682723999,
"learning_rate": 9.9423149407581e-07,
"loss": 0.7526,
"step": 3565
},
{
"epoch": 0.055578475406135426,
"grad_norm": 5.3241143226623535,
"learning_rate": 9.94149555071205e-07,
"loss": 0.8756,
"step": 3570
},
{
"epoch": 0.055656316408104804,
"grad_norm": 4.702089786529541,
"learning_rate": 9.940676160665999e-07,
"loss": 0.7844,
"step": 3575
},
{
"epoch": 0.05573415741007418,
"grad_norm": 3.6324615478515625,
"learning_rate": 9.93985677061995e-07,
"loss": 0.9757,
"step": 3580
},
{
"epoch": 0.05581199841204356,
"grad_norm": 5.574779510498047,
"learning_rate": 9.9390373805739e-07,
"loss": 0.8368,
"step": 3585
},
{
"epoch": 0.05588983941401294,
"grad_norm": 3.3760433197021484,
"learning_rate": 9.93821799052785e-07,
"loss": 0.9753,
"step": 3590
},
{
"epoch": 0.05596768041598232,
"grad_norm": 3.6447086334228516,
"learning_rate": 9.937398600481802e-07,
"loss": 0.8654,
"step": 3595
},
{
"epoch": 0.056045521417951695,
"grad_norm": 10.935750007629395,
"learning_rate": 9.936579210435752e-07,
"loss": 0.8504,
"step": 3600
},
{
"epoch": 0.056123362419921066,
"grad_norm": 5.356347560882568,
"learning_rate": 9.9357598203897e-07,
"loss": 0.8439,
"step": 3605
},
{
"epoch": 0.056201203421890444,
"grad_norm": 7.737555027008057,
"learning_rate": 9.934940430343652e-07,
"loss": 0.8997,
"step": 3610
},
{
"epoch": 0.05627904442385982,
"grad_norm": 4.059571266174316,
"learning_rate": 9.934121040297602e-07,
"loss": 0.736,
"step": 3615
},
{
"epoch": 0.0563568854258292,
"grad_norm": 10.28212833404541,
"learning_rate": 9.933301650251551e-07,
"loss": 0.8219,
"step": 3620
},
{
"epoch": 0.05643472642779858,
"grad_norm": 7.522468090057373,
"learning_rate": 9.932482260205502e-07,
"loss": 0.7058,
"step": 3625
},
{
"epoch": 0.05651256742976796,
"grad_norm": 4.0811872482299805,
"learning_rate": 9.931662870159453e-07,
"loss": 0.8334,
"step": 3630
},
{
"epoch": 0.056590408431737335,
"grad_norm": 2.533539295196533,
"learning_rate": 9.930843480113403e-07,
"loss": 0.8185,
"step": 3635
},
{
"epoch": 0.05666824943370671,
"grad_norm": 2.272587776184082,
"learning_rate": 9.930024090067354e-07,
"loss": 0.8294,
"step": 3640
},
{
"epoch": 0.05674609043567609,
"grad_norm": 4.402963638305664,
"learning_rate": 9.929204700021305e-07,
"loss": 0.8253,
"step": 3645
},
{
"epoch": 0.05682393143764546,
"grad_norm": 4.450977802276611,
"learning_rate": 9.928385309975253e-07,
"loss": 0.7287,
"step": 3650
},
{
"epoch": 0.05690177243961484,
"grad_norm": 4.995216369628906,
"learning_rate": 9.927565919929204e-07,
"loss": 0.7744,
"step": 3655
},
{
"epoch": 0.05697961344158422,
"grad_norm": 4.42352294921875,
"learning_rate": 9.926746529883155e-07,
"loss": 0.8216,
"step": 3660
},
{
"epoch": 0.0570574544435536,
"grad_norm": 5.005922317504883,
"learning_rate": 9.925927139837106e-07,
"loss": 0.88,
"step": 3665
},
{
"epoch": 0.057135295445522975,
"grad_norm": 4.319427013397217,
"learning_rate": 9.925107749791054e-07,
"loss": 0.9386,
"step": 3670
},
{
"epoch": 0.05721313644749235,
"grad_norm": 4.61904239654541,
"learning_rate": 9.924288359745005e-07,
"loss": 0.8248,
"step": 3675
},
{
"epoch": 0.05729097744946173,
"grad_norm": 3.656996250152588,
"learning_rate": 9.923468969698956e-07,
"loss": 0.8898,
"step": 3680
},
{
"epoch": 0.05736881845143111,
"grad_norm": 10.73847484588623,
"learning_rate": 9.922649579652907e-07,
"loss": 0.7295,
"step": 3685
},
{
"epoch": 0.05744665945340048,
"grad_norm": 3.2956910133361816,
"learning_rate": 9.921830189606855e-07,
"loss": 0.7937,
"step": 3690
},
{
"epoch": 0.05752450045536986,
"grad_norm": 3.310476541519165,
"learning_rate": 9.921010799560806e-07,
"loss": 0.7597,
"step": 3695
},
{
"epoch": 0.05760234145733924,
"grad_norm": 6.073892116546631,
"learning_rate": 9.920191409514757e-07,
"loss": 0.775,
"step": 3700
},
{
"epoch": 0.057680182459308615,
"grad_norm": 4.651096820831299,
"learning_rate": 9.919372019468707e-07,
"loss": 0.9085,
"step": 3705
},
{
"epoch": 0.057758023461277994,
"grad_norm": 5.112009048461914,
"learning_rate": 9.918552629422658e-07,
"loss": 0.854,
"step": 3710
},
{
"epoch": 0.05783586446324737,
"grad_norm": 3.9226460456848145,
"learning_rate": 9.917733239376607e-07,
"loss": 0.8815,
"step": 3715
},
{
"epoch": 0.05791370546521675,
"grad_norm": 5.9531707763671875,
"learning_rate": 9.916913849330558e-07,
"loss": 0.8794,
"step": 3720
},
{
"epoch": 0.05799154646718613,
"grad_norm": 7.749881744384766,
"learning_rate": 9.916094459284508e-07,
"loss": 0.8347,
"step": 3725
},
{
"epoch": 0.058069387469155506,
"grad_norm": 3.2161874771118164,
"learning_rate": 9.915275069238457e-07,
"loss": 0.8297,
"step": 3730
},
{
"epoch": 0.05814722847112488,
"grad_norm": 3.4381978511810303,
"learning_rate": 9.914455679192408e-07,
"loss": 0.8016,
"step": 3735
},
{
"epoch": 0.058225069473094256,
"grad_norm": 6.175289630889893,
"learning_rate": 9.913636289146358e-07,
"loss": 0.7378,
"step": 3740
},
{
"epoch": 0.058302910475063634,
"grad_norm": 7.808245658874512,
"learning_rate": 9.91281689910031e-07,
"loss": 0.8631,
"step": 3745
},
{
"epoch": 0.05838075147703301,
"grad_norm": 8.13048267364502,
"learning_rate": 9.91199750905426e-07,
"loss": 0.7241,
"step": 3750
},
{
"epoch": 0.05845859247900239,
"grad_norm": 14.47769546508789,
"learning_rate": 9.91117811900821e-07,
"loss": 0.7604,
"step": 3755
},
{
"epoch": 0.05853643348097177,
"grad_norm": 13.544578552246094,
"learning_rate": 9.91035872896216e-07,
"loss": 0.9168,
"step": 3760
},
{
"epoch": 0.058614274482941146,
"grad_norm": 3.012338638305664,
"learning_rate": 9.90953933891611e-07,
"loss": 0.8438,
"step": 3765
},
{
"epoch": 0.058692115484910524,
"grad_norm": 8.543879508972168,
"learning_rate": 9.90871994887006e-07,
"loss": 0.8027,
"step": 3770
},
{
"epoch": 0.058769956486879896,
"grad_norm": 3.5552265644073486,
"learning_rate": 9.907900558824012e-07,
"loss": 0.9394,
"step": 3775
},
{
"epoch": 0.058847797488849274,
"grad_norm": 2.7634129524230957,
"learning_rate": 9.90708116877796e-07,
"loss": 0.8544,
"step": 3780
},
{
"epoch": 0.05892563849081865,
"grad_norm": 4.050414085388184,
"learning_rate": 9.90626177873191e-07,
"loss": 0.8405,
"step": 3785
},
{
"epoch": 0.05900347949278803,
"grad_norm": 3.3038461208343506,
"learning_rate": 9.905442388685862e-07,
"loss": 0.763,
"step": 3790
},
{
"epoch": 0.05908132049475741,
"grad_norm": 5.79196834564209,
"learning_rate": 9.904622998639812e-07,
"loss": 0.8174,
"step": 3795
},
{
"epoch": 0.059159161496726786,
"grad_norm": 4.359936714172363,
"learning_rate": 9.903803608593763e-07,
"loss": 0.8229,
"step": 3800
},
{
"epoch": 0.059237002498696165,
"grad_norm": 6.546017169952393,
"learning_rate": 9.902984218547714e-07,
"loss": 0.835,
"step": 3805
},
{
"epoch": 0.05931484350066554,
"grad_norm": 6.203246593475342,
"learning_rate": 9.902164828501663e-07,
"loss": 0.9859,
"step": 3810
},
{
"epoch": 0.05939268450263492,
"grad_norm": 3.92028546333313,
"learning_rate": 9.901345438455613e-07,
"loss": 0.84,
"step": 3815
},
{
"epoch": 0.05947052550460429,
"grad_norm": 4.098803520202637,
"learning_rate": 9.900526048409564e-07,
"loss": 0.8088,
"step": 3820
},
{
"epoch": 0.05954836650657367,
"grad_norm": 4.060965061187744,
"learning_rate": 9.899706658363513e-07,
"loss": 0.8048,
"step": 3825
},
{
"epoch": 0.05962620750854305,
"grad_norm": 7.130313873291016,
"learning_rate": 9.898887268317463e-07,
"loss": 0.991,
"step": 3830
},
{
"epoch": 0.05970404851051243,
"grad_norm": 4.355027198791504,
"learning_rate": 9.898067878271414e-07,
"loss": 0.9168,
"step": 3835
},
{
"epoch": 0.059781889512481805,
"grad_norm": 4.409844398498535,
"learning_rate": 9.897248488225365e-07,
"loss": 0.7811,
"step": 3840
},
{
"epoch": 0.05985973051445118,
"grad_norm": 4.593713283538818,
"learning_rate": 9.896429098179316e-07,
"loss": 0.9282,
"step": 3845
},
{
"epoch": 0.05993757151642056,
"grad_norm": 3.813417911529541,
"learning_rate": 9.895609708133266e-07,
"loss": 0.8671,
"step": 3850
},
{
"epoch": 0.06001541251838994,
"grad_norm": 9.554966926574707,
"learning_rate": 9.894790318087215e-07,
"loss": 0.8516,
"step": 3855
},
{
"epoch": 0.06009325352035932,
"grad_norm": 3.616415500640869,
"learning_rate": 9.893970928041166e-07,
"loss": 0.8382,
"step": 3860
},
{
"epoch": 0.06017109452232869,
"grad_norm": 3.379333019256592,
"learning_rate": 9.893151537995117e-07,
"loss": 0.9661,
"step": 3865
},
{
"epoch": 0.06024893552429807,
"grad_norm": 2.6693906784057617,
"learning_rate": 9.892332147949065e-07,
"loss": 0.8133,
"step": 3870
},
{
"epoch": 0.060326776526267445,
"grad_norm": 4.557685375213623,
"learning_rate": 9.891512757903016e-07,
"loss": 0.8617,
"step": 3875
},
{
"epoch": 0.06040461752823682,
"grad_norm": 2.69423770904541,
"learning_rate": 9.890693367856967e-07,
"loss": 0.7904,
"step": 3880
},
{
"epoch": 0.0604824585302062,
"grad_norm": 3.213026762008667,
"learning_rate": 9.889873977810917e-07,
"loss": 0.7852,
"step": 3885
},
{
"epoch": 0.06056029953217558,
"grad_norm": 3.25534725189209,
"learning_rate": 9.889054587764868e-07,
"loss": 0.8165,
"step": 3890
},
{
"epoch": 0.06063814053414496,
"grad_norm": 5.834784984588623,
"learning_rate": 9.888235197718817e-07,
"loss": 0.9304,
"step": 3895
},
{
"epoch": 0.060715981536114336,
"grad_norm": 3.369537353515625,
"learning_rate": 9.887415807672768e-07,
"loss": 0.7562,
"step": 3900
},
{
"epoch": 0.06079382253808371,
"grad_norm": 5.367571830749512,
"learning_rate": 9.886596417626718e-07,
"loss": 0.8158,
"step": 3905
},
{
"epoch": 0.060871663540053085,
"grad_norm": 4.397671222686768,
"learning_rate": 9.88577702758067e-07,
"loss": 0.8699,
"step": 3910
},
{
"epoch": 0.06094950454202246,
"grad_norm": 3.270768404006958,
"learning_rate": 9.88495763753462e-07,
"loss": 0.9022,
"step": 3915
},
{
"epoch": 0.06102734554399184,
"grad_norm": 4.194687366485596,
"learning_rate": 9.884138247488568e-07,
"loss": 0.937,
"step": 3920
},
{
"epoch": 0.06110518654596122,
"grad_norm": 3.5028905868530273,
"learning_rate": 9.88331885744252e-07,
"loss": 0.9853,
"step": 3925
},
{
"epoch": 0.0611830275479306,
"grad_norm": 9.81811237335205,
"learning_rate": 9.88249946739647e-07,
"loss": 0.9332,
"step": 3930
},
{
"epoch": 0.061260868549899976,
"grad_norm": 9.531314849853516,
"learning_rate": 9.881680077350419e-07,
"loss": 0.8402,
"step": 3935
},
{
"epoch": 0.061338709551869354,
"grad_norm": 6.465907096862793,
"learning_rate": 9.88086068730437e-07,
"loss": 0.9443,
"step": 3940
},
{
"epoch": 0.06141655055383873,
"grad_norm": 9.462715148925781,
"learning_rate": 9.88004129725832e-07,
"loss": 0.7544,
"step": 3945
},
{
"epoch": 0.0614943915558081,
"grad_norm": 4.005988121032715,
"learning_rate": 9.87922190721227e-07,
"loss": 0.8655,
"step": 3950
},
{
"epoch": 0.06157223255777748,
"grad_norm": 6.533730983734131,
"learning_rate": 9.878402517166222e-07,
"loss": 0.9202,
"step": 3955
},
{
"epoch": 0.06165007355974686,
"grad_norm": 4.695230484008789,
"learning_rate": 9.877583127120172e-07,
"loss": 0.7832,
"step": 3960
},
{
"epoch": 0.06172791456171624,
"grad_norm": 4.281477451324463,
"learning_rate": 9.87676373707412e-07,
"loss": 0.8885,
"step": 3965
},
{
"epoch": 0.061805755563685616,
"grad_norm": 4.162761688232422,
"learning_rate": 9.875944347028072e-07,
"loss": 0.8782,
"step": 3970
},
{
"epoch": 0.061883596565654994,
"grad_norm": 3.2788217067718506,
"learning_rate": 9.875124956982022e-07,
"loss": 0.901,
"step": 3975
},
{
"epoch": 0.06196143756762437,
"grad_norm": 3.823699951171875,
"learning_rate": 9.874305566935971e-07,
"loss": 0.8811,
"step": 3980
},
{
"epoch": 0.06203927856959375,
"grad_norm": 5.366037368774414,
"learning_rate": 9.873486176889922e-07,
"loss": 0.9297,
"step": 3985
},
{
"epoch": 0.06211711957156313,
"grad_norm": 3.4064414501190186,
"learning_rate": 9.872666786843873e-07,
"loss": 0.8329,
"step": 3990
},
{
"epoch": 0.0621949605735325,
"grad_norm": 6.189504146575928,
"learning_rate": 9.871847396797823e-07,
"loss": 0.8325,
"step": 3995
},
{
"epoch": 0.06227280157550188,
"grad_norm": 2.825984001159668,
"learning_rate": 9.871028006751774e-07,
"loss": 0.7901,
"step": 4000
},
{
"epoch": 0.062350642577471256,
"grad_norm": 3.610321521759033,
"learning_rate": 9.870208616705725e-07,
"loss": 0.7974,
"step": 4005
},
{
"epoch": 0.062428483579440634,
"grad_norm": 4.4487128257751465,
"learning_rate": 9.869389226659676e-07,
"loss": 0.9165,
"step": 4010
},
{
"epoch": 0.06250632458141,
"grad_norm": 3.201486110687256,
"learning_rate": 9.868569836613624e-07,
"loss": 0.9165,
"step": 4015
},
{
"epoch": 0.06258416558337938,
"grad_norm": 6.013232231140137,
"learning_rate": 9.867750446567575e-07,
"loss": 0.7316,
"step": 4020
},
{
"epoch": 0.06266200658534876,
"grad_norm": 4.562684535980225,
"learning_rate": 9.866931056521526e-07,
"loss": 0.8648,
"step": 4025
},
{
"epoch": 0.06273984758731814,
"grad_norm": 3.915780544281006,
"learning_rate": 9.866111666475474e-07,
"loss": 0.7684,
"step": 4030
},
{
"epoch": 0.06281768858928752,
"grad_norm": 13.098698616027832,
"learning_rate": 9.865292276429425e-07,
"loss": 0.8222,
"step": 4035
},
{
"epoch": 0.0628955295912569,
"grad_norm": 5.85524320602417,
"learning_rate": 9.864472886383376e-07,
"loss": 0.8593,
"step": 4040
},
{
"epoch": 0.06297337059322627,
"grad_norm": 12.446966171264648,
"learning_rate": 9.863653496337327e-07,
"loss": 0.6881,
"step": 4045
},
{
"epoch": 0.06305121159519565,
"grad_norm": 3.663348436355591,
"learning_rate": 9.862834106291277e-07,
"loss": 0.6791,
"step": 4050
},
{
"epoch": 0.06312905259716503,
"grad_norm": 5.9468159675598145,
"learning_rate": 9.862014716245226e-07,
"loss": 0.883,
"step": 4055
},
{
"epoch": 0.06320689359913441,
"grad_norm": 4.544028282165527,
"learning_rate": 9.861195326199177e-07,
"loss": 0.6979,
"step": 4060
},
{
"epoch": 0.06328473460110379,
"grad_norm": 4.25548791885376,
"learning_rate": 9.860375936153127e-07,
"loss": 0.757,
"step": 4065
},
{
"epoch": 0.06336257560307317,
"grad_norm": 4.892475128173828,
"learning_rate": 9.859556546107078e-07,
"loss": 0.8346,
"step": 4070
},
{
"epoch": 0.06344041660504254,
"grad_norm": 3.967132091522217,
"learning_rate": 9.858737156061027e-07,
"loss": 0.7614,
"step": 4075
},
{
"epoch": 0.06351825760701192,
"grad_norm": 9.065237998962402,
"learning_rate": 9.857917766014978e-07,
"loss": 0.8471,
"step": 4080
},
{
"epoch": 0.0635960986089813,
"grad_norm": 5.109429359436035,
"learning_rate": 9.857098375968928e-07,
"loss": 0.7441,
"step": 4085
},
{
"epoch": 0.06367393961095068,
"grad_norm": 13.242950439453125,
"learning_rate": 9.85627898592288e-07,
"loss": 0.7784,
"step": 4090
},
{
"epoch": 0.06375178061292006,
"grad_norm": 7.870430946350098,
"learning_rate": 9.855459595876828e-07,
"loss": 0.9225,
"step": 4095
},
{
"epoch": 0.06382962161488942,
"grad_norm": 6.2109761238098145,
"learning_rate": 9.854640205830778e-07,
"loss": 0.8741,
"step": 4100
},
{
"epoch": 0.0639074626168588,
"grad_norm": 4.566768169403076,
"learning_rate": 9.85382081578473e-07,
"loss": 0.7312,
"step": 4105
},
{
"epoch": 0.06398530361882818,
"grad_norm": 4.343275547027588,
"learning_rate": 9.85300142573868e-07,
"loss": 0.8077,
"step": 4110
},
{
"epoch": 0.06406314462079755,
"grad_norm": 3.710590124130249,
"learning_rate": 9.85218203569263e-07,
"loss": 0.8512,
"step": 4115
},
{
"epoch": 0.06414098562276693,
"grad_norm": 5.875495433807373,
"learning_rate": 9.85136264564658e-07,
"loss": 0.8588,
"step": 4120
},
{
"epoch": 0.06421882662473631,
"grad_norm": 5.609859943389893,
"learning_rate": 9.85054325560053e-07,
"loss": 0.9756,
"step": 4125
},
{
"epoch": 0.06429666762670569,
"grad_norm": 3.695260763168335,
"learning_rate": 9.84972386555448e-07,
"loss": 0.8677,
"step": 4130
},
{
"epoch": 0.06437450862867507,
"grad_norm": 4.265758991241455,
"learning_rate": 9.848904475508432e-07,
"loss": 0.868,
"step": 4135
},
{
"epoch": 0.06445234963064445,
"grad_norm": 5.0540361404418945,
"learning_rate": 9.84808508546238e-07,
"loss": 0.7448,
"step": 4140
},
{
"epoch": 0.06453019063261382,
"grad_norm": 3.1422078609466553,
"learning_rate": 9.84726569541633e-07,
"loss": 0.8085,
"step": 4145
},
{
"epoch": 0.0646080316345832,
"grad_norm": 3.257333755493164,
"learning_rate": 9.846446305370282e-07,
"loss": 0.904,
"step": 4150
},
{
"epoch": 0.06468587263655258,
"grad_norm": 6.303824424743652,
"learning_rate": 9.845626915324232e-07,
"loss": 0.6844,
"step": 4155
},
{
"epoch": 0.06476371363852196,
"grad_norm": 7.541611194610596,
"learning_rate": 9.844807525278183e-07,
"loss": 0.9385,
"step": 4160
},
{
"epoch": 0.06484155464049134,
"grad_norm": 3.217496633529663,
"learning_rate": 9.843988135232134e-07,
"loss": 0.8201,
"step": 4165
},
{
"epoch": 0.06491939564246071,
"grad_norm": 4.375589370727539,
"learning_rate": 9.843168745186083e-07,
"loss": 0.8042,
"step": 4170
},
{
"epoch": 0.06499723664443009,
"grad_norm": 6.62051248550415,
"learning_rate": 9.842349355140033e-07,
"loss": 0.7035,
"step": 4175
},
{
"epoch": 0.06507507764639947,
"grad_norm": 4.503577709197998,
"learning_rate": 9.841529965093984e-07,
"loss": 0.8564,
"step": 4180
},
{
"epoch": 0.06515291864836883,
"grad_norm": 3.583695411682129,
"learning_rate": 9.840710575047933e-07,
"loss": 0.9069,
"step": 4185
},
{
"epoch": 0.06523075965033821,
"grad_norm": 4.029445648193359,
"learning_rate": 9.839891185001883e-07,
"loss": 0.8835,
"step": 4190
},
{
"epoch": 0.06530860065230759,
"grad_norm": 3.6656410694122314,
"learning_rate": 9.839071794955834e-07,
"loss": 0.7814,
"step": 4195
},
{
"epoch": 0.06538644165427697,
"grad_norm": 3.0505213737487793,
"learning_rate": 9.838252404909785e-07,
"loss": 0.7942,
"step": 4200
},
{
"epoch": 0.06546428265624635,
"grad_norm": 4.775297164916992,
"learning_rate": 9.837433014863736e-07,
"loss": 0.8875,
"step": 4205
},
{
"epoch": 0.06554212365821573,
"grad_norm": 5.490566253662109,
"learning_rate": 9.836613624817686e-07,
"loss": 0.7635,
"step": 4210
},
{
"epoch": 0.0656199646601851,
"grad_norm": 3.202033519744873,
"learning_rate": 9.835794234771635e-07,
"loss": 0.7558,
"step": 4215
},
{
"epoch": 0.06569780566215448,
"grad_norm": 5.484325408935547,
"learning_rate": 9.834974844725586e-07,
"loss": 0.8066,
"step": 4220
},
{
"epoch": 0.06577564666412386,
"grad_norm": 2.903610944747925,
"learning_rate": 9.834155454679537e-07,
"loss": 0.7833,
"step": 4225
},
{
"epoch": 0.06585348766609324,
"grad_norm": 3.188546895980835,
"learning_rate": 9.833336064633485e-07,
"loss": 0.7774,
"step": 4230
},
{
"epoch": 0.06593132866806262,
"grad_norm": 3.055574655532837,
"learning_rate": 9.832516674587436e-07,
"loss": 0.6551,
"step": 4235
},
{
"epoch": 0.066009169670032,
"grad_norm": 4.439972877502441,
"learning_rate": 9.831697284541387e-07,
"loss": 0.7456,
"step": 4240
},
{
"epoch": 0.06608701067200137,
"grad_norm": 2.4513139724731445,
"learning_rate": 9.830877894495337e-07,
"loss": 0.7752,
"step": 4245
},
{
"epoch": 0.06616485167397075,
"grad_norm": 4.66846227645874,
"learning_rate": 9.830058504449288e-07,
"loss": 0.9322,
"step": 4250
},
{
"epoch": 0.06624269267594013,
"grad_norm": 4.819527626037598,
"learning_rate": 9.82923911440324e-07,
"loss": 0.9361,
"step": 4255
},
{
"epoch": 0.06632053367790951,
"grad_norm": 8.028414726257324,
"learning_rate": 9.828419724357188e-07,
"loss": 0.8214,
"step": 4260
},
{
"epoch": 0.06639837467987889,
"grad_norm": 3.565459728240967,
"learning_rate": 9.827600334311138e-07,
"loss": 0.7668,
"step": 4265
},
{
"epoch": 0.06647621568184826,
"grad_norm": 2.9492602348327637,
"learning_rate": 9.82678094426509e-07,
"loss": 0.7513,
"step": 4270
},
{
"epoch": 0.06655405668381763,
"grad_norm": 4.8683576583862305,
"learning_rate": 9.82596155421904e-07,
"loss": 0.8725,
"step": 4275
},
{
"epoch": 0.066631897685787,
"grad_norm": 4.162265300750732,
"learning_rate": 9.825142164172989e-07,
"loss": 0.8275,
"step": 4280
},
{
"epoch": 0.06670973868775638,
"grad_norm": 3.6537702083587646,
"learning_rate": 9.82432277412694e-07,
"loss": 0.7264,
"step": 4285
},
{
"epoch": 0.06678757968972576,
"grad_norm": 3.9282073974609375,
"learning_rate": 9.82350338408089e-07,
"loss": 0.977,
"step": 4290
},
{
"epoch": 0.06686542069169514,
"grad_norm": 5.129037857055664,
"learning_rate": 9.82268399403484e-07,
"loss": 0.8609,
"step": 4295
},
{
"epoch": 0.06694326169366452,
"grad_norm": 4.563994884490967,
"learning_rate": 9.82186460398879e-07,
"loss": 0.8303,
"step": 4300
},
{
"epoch": 0.0670211026956339,
"grad_norm": 3.177889585494995,
"learning_rate": 9.82104521394274e-07,
"loss": 0.9134,
"step": 4305
},
{
"epoch": 0.06709894369760327,
"grad_norm": 4.675817966461182,
"learning_rate": 9.82022582389669e-07,
"loss": 0.7188,
"step": 4310
},
{
"epoch": 0.06717678469957265,
"grad_norm": 6.9661173820495605,
"learning_rate": 9.819406433850642e-07,
"loss": 0.7871,
"step": 4315
},
{
"epoch": 0.06725462570154203,
"grad_norm": 6.177728176116943,
"learning_rate": 9.818587043804592e-07,
"loss": 0.7438,
"step": 4320
},
{
"epoch": 0.06733246670351141,
"grad_norm": 3.9021103382110596,
"learning_rate": 9.81776765375854e-07,
"loss": 0.8456,
"step": 4325
},
{
"epoch": 0.06741030770548079,
"grad_norm": 6.576573371887207,
"learning_rate": 9.816948263712492e-07,
"loss": 0.8173,
"step": 4330
},
{
"epoch": 0.06748814870745017,
"grad_norm": 3.117799997329712,
"learning_rate": 9.816128873666442e-07,
"loss": 0.8552,
"step": 4335
},
{
"epoch": 0.06756598970941954,
"grad_norm": 5.52931022644043,
"learning_rate": 9.815309483620391e-07,
"loss": 0.7353,
"step": 4340
},
{
"epoch": 0.06764383071138892,
"grad_norm": 3.3571298122406006,
"learning_rate": 9.814490093574342e-07,
"loss": 0.7253,
"step": 4345
},
{
"epoch": 0.0677216717133583,
"grad_norm": 4.7125468254089355,
"learning_rate": 9.813670703528293e-07,
"loss": 0.8708,
"step": 4350
},
{
"epoch": 0.06779951271532768,
"grad_norm": 3.7811620235443115,
"learning_rate": 9.812851313482243e-07,
"loss": 0.744,
"step": 4355
},
{
"epoch": 0.06787735371729704,
"grad_norm": 4.079869270324707,
"learning_rate": 9.812031923436194e-07,
"loss": 0.8291,
"step": 4360
},
{
"epoch": 0.06795519471926642,
"grad_norm": 2.9714179039001465,
"learning_rate": 9.811212533390145e-07,
"loss": 0.879,
"step": 4365
},
{
"epoch": 0.0680330357212358,
"grad_norm": 4.301975250244141,
"learning_rate": 9.810393143344094e-07,
"loss": 0.7528,
"step": 4370
},
{
"epoch": 0.06811087672320518,
"grad_norm": 4.707742214202881,
"learning_rate": 9.809573753298044e-07,
"loss": 0.7686,
"step": 4375
},
{
"epoch": 0.06818871772517456,
"grad_norm": 2.911092758178711,
"learning_rate": 9.808754363251995e-07,
"loss": 0.8224,
"step": 4380
},
{
"epoch": 0.06826655872714393,
"grad_norm": 3.809354543685913,
"learning_rate": 9.807934973205944e-07,
"loss": 0.9337,
"step": 4385
},
{
"epoch": 0.06834439972911331,
"grad_norm": 3.0105934143066406,
"learning_rate": 9.807115583159894e-07,
"loss": 0.7952,
"step": 4390
},
{
"epoch": 0.06842224073108269,
"grad_norm": 4.267519474029541,
"learning_rate": 9.806296193113845e-07,
"loss": 0.9312,
"step": 4395
},
{
"epoch": 0.06850008173305207,
"grad_norm": 13.714824676513672,
"learning_rate": 9.805476803067796e-07,
"loss": 0.7266,
"step": 4400
},
{
"epoch": 0.06857792273502145,
"grad_norm": 5.861302852630615,
"learning_rate": 9.804657413021747e-07,
"loss": 0.8267,
"step": 4405
},
{
"epoch": 0.06865576373699082,
"grad_norm": 4.226170539855957,
"learning_rate": 9.803838022975697e-07,
"loss": 0.6108,
"step": 4410
},
{
"epoch": 0.0687336047389602,
"grad_norm": 4.260887145996094,
"learning_rate": 9.803018632929648e-07,
"loss": 0.7917,
"step": 4415
},
{
"epoch": 0.06881144574092958,
"grad_norm": 2.1800050735473633,
"learning_rate": 9.802199242883597e-07,
"loss": 0.7279,
"step": 4420
},
{
"epoch": 0.06888928674289896,
"grad_norm": 4.386568069458008,
"learning_rate": 9.801379852837548e-07,
"loss": 0.7997,
"step": 4425
},
{
"epoch": 0.06896712774486834,
"grad_norm": 7.1831135749816895,
"learning_rate": 9.800560462791498e-07,
"loss": 0.9703,
"step": 4430
},
{
"epoch": 0.06904496874683771,
"grad_norm": 7.631860733032227,
"learning_rate": 9.799741072745447e-07,
"loss": 0.7836,
"step": 4435
},
{
"epoch": 0.06912280974880709,
"grad_norm": 3.6150078773498535,
"learning_rate": 9.798921682699398e-07,
"loss": 0.6982,
"step": 4440
},
{
"epoch": 0.06920065075077646,
"grad_norm": 5.267273902893066,
"learning_rate": 9.798102292653348e-07,
"loss": 0.9292,
"step": 4445
},
{
"epoch": 0.06927849175274584,
"grad_norm": 6.139009952545166,
"learning_rate": 9.7972829026073e-07,
"loss": 0.776,
"step": 4450
},
{
"epoch": 0.06935633275471521,
"grad_norm": 6.20229959487915,
"learning_rate": 9.79646351256125e-07,
"loss": 0.8239,
"step": 4455
},
{
"epoch": 0.06943417375668459,
"grad_norm": 3.204371929168701,
"learning_rate": 9.7956441225152e-07,
"loss": 0.8123,
"step": 4460
},
{
"epoch": 0.06951201475865397,
"grad_norm": 4.521599769592285,
"learning_rate": 9.79482473246915e-07,
"loss": 0.7049,
"step": 4465
},
{
"epoch": 0.06958985576062335,
"grad_norm": 5.0935750007629395,
"learning_rate": 9.7940053424231e-07,
"loss": 0.8673,
"step": 4470
},
{
"epoch": 0.06966769676259273,
"grad_norm": 7.926290512084961,
"learning_rate": 9.79318595237705e-07,
"loss": 0.8195,
"step": 4475
},
{
"epoch": 0.0697455377645621,
"grad_norm": 4.315165042877197,
"learning_rate": 9.792366562331e-07,
"loss": 0.8674,
"step": 4480
},
{
"epoch": 0.06982337876653148,
"grad_norm": 3.775836706161499,
"learning_rate": 9.79154717228495e-07,
"loss": 0.8334,
"step": 4485
},
{
"epoch": 0.06990121976850086,
"grad_norm": 2.560904026031494,
"learning_rate": 9.7907277822389e-07,
"loss": 0.8271,
"step": 4490
},
{
"epoch": 0.06997906077047024,
"grad_norm": 11.29925537109375,
"learning_rate": 9.789908392192852e-07,
"loss": 0.9633,
"step": 4495
},
{
"epoch": 0.07005690177243962,
"grad_norm": 4.101975917816162,
"learning_rate": 9.789089002146802e-07,
"loss": 0.8858,
"step": 4500
},
{
"epoch": 0.070134742774409,
"grad_norm": 2.970782518386841,
"learning_rate": 9.78826961210075e-07,
"loss": 0.8608,
"step": 4505
},
{
"epoch": 0.07021258377637837,
"grad_norm": 7.289088726043701,
"learning_rate": 9.787450222054702e-07,
"loss": 0.8347,
"step": 4510
},
{
"epoch": 0.07029042477834775,
"grad_norm": 7.107760429382324,
"learning_rate": 9.786630832008653e-07,
"loss": 0.771,
"step": 4515
},
{
"epoch": 0.07036826578031713,
"grad_norm": 3.630275249481201,
"learning_rate": 9.785811441962603e-07,
"loss": 0.7113,
"step": 4520
},
{
"epoch": 0.07044610678228651,
"grad_norm": 4.681270122528076,
"learning_rate": 9.784992051916554e-07,
"loss": 0.8278,
"step": 4525
},
{
"epoch": 0.07052394778425587,
"grad_norm": 3.6923000812530518,
"learning_rate": 9.784172661870503e-07,
"loss": 0.8067,
"step": 4530
},
{
"epoch": 0.07060178878622525,
"grad_norm": 3.538496255874634,
"learning_rate": 9.783353271824453e-07,
"loss": 0.7577,
"step": 4535
},
{
"epoch": 0.07067962978819463,
"grad_norm": 3.3996520042419434,
"learning_rate": 9.782533881778404e-07,
"loss": 0.9051,
"step": 4540
},
{
"epoch": 0.070757470790164,
"grad_norm": 4.107473850250244,
"learning_rate": 9.781714491732353e-07,
"loss": 0.8982,
"step": 4545
},
{
"epoch": 0.07083531179213338,
"grad_norm": 2.9986937046051025,
"learning_rate": 9.780895101686304e-07,
"loss": 0.9025,
"step": 4550
},
{
"epoch": 0.07091315279410276,
"grad_norm": 3.413224697113037,
"learning_rate": 9.780075711640254e-07,
"loss": 0.7699,
"step": 4555
},
{
"epoch": 0.07099099379607214,
"grad_norm": 3.332380771636963,
"learning_rate": 9.779256321594205e-07,
"loss": 0.6913,
"step": 4560
},
{
"epoch": 0.07106883479804152,
"grad_norm": 3.161701202392578,
"learning_rate": 9.778436931548156e-07,
"loss": 0.7502,
"step": 4565
},
{
"epoch": 0.0711466758000109,
"grad_norm": 3.6863913536071777,
"learning_rate": 9.777617541502107e-07,
"loss": 0.7959,
"step": 4570
},
{
"epoch": 0.07122451680198028,
"grad_norm": 4.537403583526611,
"learning_rate": 9.776798151456055e-07,
"loss": 0.8646,
"step": 4575
},
{
"epoch": 0.07130235780394965,
"grad_norm": 4.111873149871826,
"learning_rate": 9.775978761410006e-07,
"loss": 0.8503,
"step": 4580
},
{
"epoch": 0.07138019880591903,
"grad_norm": 8.788448333740234,
"learning_rate": 9.775159371363957e-07,
"loss": 0.8046,
"step": 4585
},
{
"epoch": 0.07145803980788841,
"grad_norm": 5.538233757019043,
"learning_rate": 9.774339981317905e-07,
"loss": 0.8018,
"step": 4590
},
{
"epoch": 0.07153588080985779,
"grad_norm": 6.06341028213501,
"learning_rate": 9.773520591271856e-07,
"loss": 0.8073,
"step": 4595
},
{
"epoch": 0.07161372181182717,
"grad_norm": 3.6553616523742676,
"learning_rate": 9.772701201225807e-07,
"loss": 0.8142,
"step": 4600
},
{
"epoch": 0.07169156281379654,
"grad_norm": 4.252196311950684,
"learning_rate": 9.771881811179758e-07,
"loss": 0.7505,
"step": 4605
},
{
"epoch": 0.07176940381576592,
"grad_norm": 3.3813109397888184,
"learning_rate": 9.771062421133708e-07,
"loss": 0.7076,
"step": 4610
},
{
"epoch": 0.0718472448177353,
"grad_norm": 9.012163162231445,
"learning_rate": 9.77024303108766e-07,
"loss": 0.8812,
"step": 4615
},
{
"epoch": 0.07192508581970466,
"grad_norm": 6.12354040145874,
"learning_rate": 9.769423641041608e-07,
"loss": 0.8218,
"step": 4620
},
{
"epoch": 0.07200292682167404,
"grad_norm": 3.364898681640625,
"learning_rate": 9.768604250995558e-07,
"loss": 0.8775,
"step": 4625
},
{
"epoch": 0.07208076782364342,
"grad_norm": 13.047234535217285,
"learning_rate": 9.76778486094951e-07,
"loss": 0.7578,
"step": 4630
},
{
"epoch": 0.0721586088256128,
"grad_norm": 6.722197532653809,
"learning_rate": 9.766965470903458e-07,
"loss": 0.8446,
"step": 4635
},
{
"epoch": 0.07223644982758218,
"grad_norm": 4.028960227966309,
"learning_rate": 9.766146080857409e-07,
"loss": 0.8284,
"step": 4640
},
{
"epoch": 0.07231429082955156,
"grad_norm": 3.668736219406128,
"learning_rate": 9.76532669081136e-07,
"loss": 0.8298,
"step": 4645
},
{
"epoch": 0.07239213183152093,
"grad_norm": 3.391463041305542,
"learning_rate": 9.76450730076531e-07,
"loss": 0.8038,
"step": 4650
},
{
"epoch": 0.07246997283349031,
"grad_norm": 2.8080356121063232,
"learning_rate": 9.76368791071926e-07,
"loss": 0.8614,
"step": 4655
},
{
"epoch": 0.07254781383545969,
"grad_norm": 3.9080796241760254,
"learning_rate": 9.762868520673212e-07,
"loss": 0.8239,
"step": 4660
},
{
"epoch": 0.07262565483742907,
"grad_norm": 3.0968992710113525,
"learning_rate": 9.76204913062716e-07,
"loss": 0.8751,
"step": 4665
},
{
"epoch": 0.07270349583939845,
"grad_norm": 6.975797176361084,
"learning_rate": 9.76122974058111e-07,
"loss": 0.7877,
"step": 4670
},
{
"epoch": 0.07278133684136782,
"grad_norm": 5.175839424133301,
"learning_rate": 9.760410350535062e-07,
"loss": 0.7242,
"step": 4675
},
{
"epoch": 0.0728591778433372,
"grad_norm": 3.86811900138855,
"learning_rate": 9.759590960489012e-07,
"loss": 0.8628,
"step": 4680
},
{
"epoch": 0.07293701884530658,
"grad_norm": 4.670974254608154,
"learning_rate": 9.75877157044296e-07,
"loss": 0.7741,
"step": 4685
},
{
"epoch": 0.07301485984727596,
"grad_norm": 3.4863369464874268,
"learning_rate": 9.757952180396912e-07,
"loss": 0.9401,
"step": 4690
},
{
"epoch": 0.07309270084924534,
"grad_norm": 4.012441158294678,
"learning_rate": 9.757132790350863e-07,
"loss": 0.8949,
"step": 4695
},
{
"epoch": 0.07317054185121472,
"grad_norm": 3.7120773792266846,
"learning_rate": 9.756313400304813e-07,
"loss": 0.912,
"step": 4700
},
{
"epoch": 0.07324838285318408,
"grad_norm": 4.149153232574463,
"learning_rate": 9.755494010258762e-07,
"loss": 0.7284,
"step": 4705
},
{
"epoch": 0.07332622385515346,
"grad_norm": 3.724862813949585,
"learning_rate": 9.754674620212713e-07,
"loss": 0.8486,
"step": 4710
},
{
"epoch": 0.07340406485712284,
"grad_norm": 5.275464057922363,
"learning_rate": 9.753855230166663e-07,
"loss": 0.7661,
"step": 4715
},
{
"epoch": 0.07348190585909221,
"grad_norm": 8.389967918395996,
"learning_rate": 9.753035840120614e-07,
"loss": 0.8646,
"step": 4720
},
{
"epoch": 0.07355974686106159,
"grad_norm": 4.1537017822265625,
"learning_rate": 9.752216450074565e-07,
"loss": 0.8596,
"step": 4725
},
{
"epoch": 0.07363758786303097,
"grad_norm": 3.4971349239349365,
"learning_rate": 9.751397060028514e-07,
"loss": 0.7888,
"step": 4730
},
{
"epoch": 0.07371542886500035,
"grad_norm": 2.9952375888824463,
"learning_rate": 9.750577669982464e-07,
"loss": 0.8127,
"step": 4735
},
{
"epoch": 0.07379326986696973,
"grad_norm": 3.0156424045562744,
"learning_rate": 9.749758279936415e-07,
"loss": 0.6775,
"step": 4740
},
{
"epoch": 0.0738711108689391,
"grad_norm": 4.386186122894287,
"learning_rate": 9.748938889890366e-07,
"loss": 0.8813,
"step": 4745
},
{
"epoch": 0.07394895187090848,
"grad_norm": 8.352777481079102,
"learning_rate": 9.748119499844314e-07,
"loss": 0.841,
"step": 4750
},
{
"epoch": 0.07402679287287786,
"grad_norm": 3.9071156978607178,
"learning_rate": 9.747300109798265e-07,
"loss": 0.8124,
"step": 4755
},
{
"epoch": 0.07410463387484724,
"grad_norm": 6.337040901184082,
"learning_rate": 9.746480719752216e-07,
"loss": 0.7601,
"step": 4760
},
{
"epoch": 0.07418247487681662,
"grad_norm": 4.740725040435791,
"learning_rate": 9.745661329706167e-07,
"loss": 0.8621,
"step": 4765
},
{
"epoch": 0.074260315878786,
"grad_norm": 3.6366703510284424,
"learning_rate": 9.744841939660117e-07,
"loss": 0.7521,
"step": 4770
},
{
"epoch": 0.07433815688075537,
"grad_norm": 5.869968891143799,
"learning_rate": 9.744022549614068e-07,
"loss": 0.8219,
"step": 4775
},
{
"epoch": 0.07441599788272475,
"grad_norm": 4.8249006271362305,
"learning_rate": 9.743203159568017e-07,
"loss": 0.8733,
"step": 4780
},
{
"epoch": 0.07449383888469413,
"grad_norm": 3.9930624961853027,
"learning_rate": 9.742383769521968e-07,
"loss": 0.7968,
"step": 4785
},
{
"epoch": 0.0745716798866635,
"grad_norm": 5.8335418701171875,
"learning_rate": 9.741564379475918e-07,
"loss": 0.8224,
"step": 4790
},
{
"epoch": 0.07464952088863287,
"grad_norm": 5.657021522521973,
"learning_rate": 9.740744989429867e-07,
"loss": 0.7896,
"step": 4795
},
{
"epoch": 0.07472736189060225,
"grad_norm": 6.225119590759277,
"learning_rate": 9.739925599383818e-07,
"loss": 0.8297,
"step": 4800
},
{
"epoch": 0.07480520289257163,
"grad_norm": 3.373596429824829,
"learning_rate": 9.739106209337768e-07,
"loss": 0.8255,
"step": 4805
},
{
"epoch": 0.07488304389454101,
"grad_norm": 2.2436752319335938,
"learning_rate": 9.73828681929172e-07,
"loss": 0.7097,
"step": 4810
},
{
"epoch": 0.07496088489651039,
"grad_norm": 3.6879262924194336,
"learning_rate": 9.73746742924567e-07,
"loss": 0.6912,
"step": 4815
},
{
"epoch": 0.07503872589847976,
"grad_norm": 3.399632692337036,
"learning_rate": 9.73664803919962e-07,
"loss": 0.8371,
"step": 4820
},
{
"epoch": 0.07511656690044914,
"grad_norm": 9.186985969543457,
"learning_rate": 9.73582864915357e-07,
"loss": 0.8157,
"step": 4825
},
{
"epoch": 0.07519440790241852,
"grad_norm": 3.6295411586761475,
"learning_rate": 9.73500925910752e-07,
"loss": 0.7785,
"step": 4830
},
{
"epoch": 0.0752722489043879,
"grad_norm": 3.534175395965576,
"learning_rate": 9.73418986906147e-07,
"loss": 0.878,
"step": 4835
},
{
"epoch": 0.07535008990635728,
"grad_norm": 3.0165436267852783,
"learning_rate": 9.73337047901542e-07,
"loss": 0.7593,
"step": 4840
},
{
"epoch": 0.07542793090832665,
"grad_norm": 4.980969429016113,
"learning_rate": 9.73255108896937e-07,
"loss": 0.7592,
"step": 4845
},
{
"epoch": 0.07550577191029603,
"grad_norm": 3.377429723739624,
"learning_rate": 9.73173169892332e-07,
"loss": 0.7849,
"step": 4850
},
{
"epoch": 0.07558361291226541,
"grad_norm": 5.916225910186768,
"learning_rate": 9.730912308877272e-07,
"loss": 0.8283,
"step": 4855
},
{
"epoch": 0.07566145391423479,
"grad_norm": 6.396664619445801,
"learning_rate": 9.730092918831222e-07,
"loss": 0.8801,
"step": 4860
},
{
"epoch": 0.07573929491620417,
"grad_norm": 3.2279000282287598,
"learning_rate": 9.729273528785173e-07,
"loss": 0.7746,
"step": 4865
},
{
"epoch": 0.07581713591817354,
"grad_norm": 3.3522236347198486,
"learning_rate": 9.728454138739122e-07,
"loss": 1.0139,
"step": 4870
},
{
"epoch": 0.07589497692014291,
"grad_norm": 7.16496467590332,
"learning_rate": 9.727634748693073e-07,
"loss": 0.8203,
"step": 4875
},
{
"epoch": 0.07597281792211229,
"grad_norm": 3.7520346641540527,
"learning_rate": 9.726815358647023e-07,
"loss": 0.7859,
"step": 4880
},
{
"epoch": 0.07605065892408167,
"grad_norm": 5.511653900146484,
"learning_rate": 9.725995968600972e-07,
"loss": 0.8594,
"step": 4885
},
{
"epoch": 0.07612849992605104,
"grad_norm": 5.89841365814209,
"learning_rate": 9.725176578554923e-07,
"loss": 0.6535,
"step": 4890
},
{
"epoch": 0.07620634092802042,
"grad_norm": 4.694098949432373,
"learning_rate": 9.724357188508873e-07,
"loss": 0.8221,
"step": 4895
},
{
"epoch": 0.0762841819299898,
"grad_norm": 4.192508220672607,
"learning_rate": 9.723537798462824e-07,
"loss": 0.8594,
"step": 4900
},
{
"epoch": 0.07636202293195918,
"grad_norm": 4.4052534103393555,
"learning_rate": 9.722718408416775e-07,
"loss": 0.7326,
"step": 4905
},
{
"epoch": 0.07643986393392856,
"grad_norm": 4.1674299240112305,
"learning_rate": 9.721899018370724e-07,
"loss": 0.7525,
"step": 4910
},
{
"epoch": 0.07651770493589793,
"grad_norm": 4.367162227630615,
"learning_rate": 9.721079628324674e-07,
"loss": 0.9038,
"step": 4915
},
{
"epoch": 0.07659554593786731,
"grad_norm": 3.0957272052764893,
"learning_rate": 9.720260238278625e-07,
"loss": 0.8247,
"step": 4920
},
{
"epoch": 0.07667338693983669,
"grad_norm": 6.5740532875061035,
"learning_rate": 9.719440848232576e-07,
"loss": 0.7707,
"step": 4925
},
{
"epoch": 0.07675122794180607,
"grad_norm": 5.08697509765625,
"learning_rate": 9.718621458186527e-07,
"loss": 0.7561,
"step": 4930
},
{
"epoch": 0.07682906894377545,
"grad_norm": 6.9134063720703125,
"learning_rate": 9.717802068140475e-07,
"loss": 0.7645,
"step": 4935
},
{
"epoch": 0.07690690994574483,
"grad_norm": 3.2047367095947266,
"learning_rate": 9.716982678094426e-07,
"loss": 0.854,
"step": 4940
},
{
"epoch": 0.0769847509477142,
"grad_norm": 3.4643442630767822,
"learning_rate": 9.716163288048377e-07,
"loss": 0.7598,
"step": 4945
},
{
"epoch": 0.07706259194968358,
"grad_norm": 5.208106517791748,
"learning_rate": 9.715343898002325e-07,
"loss": 0.7699,
"step": 4950
},
{
"epoch": 0.07714043295165296,
"grad_norm": 4.935080051422119,
"learning_rate": 9.714524507956276e-07,
"loss": 0.8839,
"step": 4955
},
{
"epoch": 0.07721827395362234,
"grad_norm": 4.052170753479004,
"learning_rate": 9.713705117910227e-07,
"loss": 0.841,
"step": 4960
},
{
"epoch": 0.0772961149555917,
"grad_norm": 3.409742593765259,
"learning_rate": 9.712885727864178e-07,
"loss": 0.8768,
"step": 4965
},
{
"epoch": 0.07737395595756108,
"grad_norm": 5.577835559844971,
"learning_rate": 9.712066337818128e-07,
"loss": 0.7609,
"step": 4970
},
{
"epoch": 0.07745179695953046,
"grad_norm": 4.966437816619873,
"learning_rate": 9.71124694777208e-07,
"loss": 0.9478,
"step": 4975
},
{
"epoch": 0.07752963796149984,
"grad_norm": 5.092791557312012,
"learning_rate": 9.710427557726028e-07,
"loss": 0.8237,
"step": 4980
},
{
"epoch": 0.07760747896346921,
"grad_norm": 2.992233991622925,
"learning_rate": 9.709608167679978e-07,
"loss": 0.8486,
"step": 4985
},
{
"epoch": 0.07768531996543859,
"grad_norm": 3.947547197341919,
"learning_rate": 9.70878877763393e-07,
"loss": 1.0368,
"step": 4990
},
{
"epoch": 0.07776316096740797,
"grad_norm": 4.660312652587891,
"learning_rate": 9.707969387587878e-07,
"loss": 0.7254,
"step": 4995
},
{
"epoch": 0.07784100196937735,
"grad_norm": 5.542099475860596,
"learning_rate": 9.707149997541829e-07,
"loss": 0.7871,
"step": 5000
},
{
"epoch": 0.07791884297134673,
"grad_norm": 2.8513717651367188,
"learning_rate": 9.70633060749578e-07,
"loss": 0.7405,
"step": 5005
},
{
"epoch": 0.0779966839733161,
"grad_norm": 3.067697286605835,
"learning_rate": 9.70551121744973e-07,
"loss": 0.8251,
"step": 5010
},
{
"epoch": 0.07807452497528548,
"grad_norm": 4.706809043884277,
"learning_rate": 9.70469182740368e-07,
"loss": 0.894,
"step": 5015
},
{
"epoch": 0.07815236597725486,
"grad_norm": 3.1183722019195557,
"learning_rate": 9.703872437357632e-07,
"loss": 0.7985,
"step": 5020
},
{
"epoch": 0.07823020697922424,
"grad_norm": 3.95314884185791,
"learning_rate": 9.703053047311582e-07,
"loss": 0.8673,
"step": 5025
},
{
"epoch": 0.07830804798119362,
"grad_norm": 5.186405658721924,
"learning_rate": 9.70223365726553e-07,
"loss": 0.8834,
"step": 5030
},
{
"epoch": 0.078385888983163,
"grad_norm": 6.825442790985107,
"learning_rate": 9.701414267219482e-07,
"loss": 0.8638,
"step": 5035
},
{
"epoch": 0.07846372998513237,
"grad_norm": 4.547275543212891,
"learning_rate": 9.700594877173432e-07,
"loss": 0.7591,
"step": 5040
},
{
"epoch": 0.07854157098710175,
"grad_norm": 3.055347204208374,
"learning_rate": 9.69977548712738e-07,
"loss": 0.7955,
"step": 5045
},
{
"epoch": 0.07861941198907112,
"grad_norm": 6.120547294616699,
"learning_rate": 9.698956097081332e-07,
"loss": 0.8606,
"step": 5050
},
{
"epoch": 0.0786972529910405,
"grad_norm": 8.125998497009277,
"learning_rate": 9.698136707035283e-07,
"loss": 0.7676,
"step": 5055
},
{
"epoch": 0.07877509399300987,
"grad_norm": 3.794414758682251,
"learning_rate": 9.697317316989233e-07,
"loss": 0.8594,
"step": 5060
},
{
"epoch": 0.07885293499497925,
"grad_norm": 4.892978191375732,
"learning_rate": 9.696497926943184e-07,
"loss": 0.774,
"step": 5065
},
{
"epoch": 0.07893077599694863,
"grad_norm": 4.139584064483643,
"learning_rate": 9.695678536897135e-07,
"loss": 0.83,
"step": 5070
},
{
"epoch": 0.07900861699891801,
"grad_norm": 7.144068241119385,
"learning_rate": 9.694859146851083e-07,
"loss": 0.8541,
"step": 5075
},
{
"epoch": 0.07908645800088739,
"grad_norm": 10.283439636230469,
"learning_rate": 9.694039756805034e-07,
"loss": 0.9355,
"step": 5080
},
{
"epoch": 0.07916429900285676,
"grad_norm": 3.0185656547546387,
"learning_rate": 9.693220366758985e-07,
"loss": 0.8761,
"step": 5085
},
{
"epoch": 0.07924214000482614,
"grad_norm": 3.299808979034424,
"learning_rate": 9.692400976712934e-07,
"loss": 0.715,
"step": 5090
},
{
"epoch": 0.07931998100679552,
"grad_norm": 7.163717746734619,
"learning_rate": 9.691581586666884e-07,
"loss": 0.6974,
"step": 5095
},
{
"epoch": 0.0793978220087649,
"grad_norm": 3.0995216369628906,
"learning_rate": 9.690762196620835e-07,
"loss": 0.8159,
"step": 5100
},
{
"epoch": 0.07947566301073428,
"grad_norm": 2.7312302589416504,
"learning_rate": 9.689942806574786e-07,
"loss": 0.8009,
"step": 5105
},
{
"epoch": 0.07955350401270365,
"grad_norm": 4.884325981140137,
"learning_rate": 9.689123416528737e-07,
"loss": 0.8839,
"step": 5110
},
{
"epoch": 0.07963134501467303,
"grad_norm": 3.1511213779449463,
"learning_rate": 9.688304026482685e-07,
"loss": 0.6633,
"step": 5115
},
{
"epoch": 0.07970918601664241,
"grad_norm": 3.034996271133423,
"learning_rate": 9.687484636436636e-07,
"loss": 0.8636,
"step": 5120
},
{
"epoch": 0.07978702701861179,
"grad_norm": 6.756342887878418,
"learning_rate": 9.686665246390587e-07,
"loss": 0.844,
"step": 5125
},
{
"epoch": 0.07986486802058117,
"grad_norm": 4.012609958648682,
"learning_rate": 9.685845856344537e-07,
"loss": 1.0347,
"step": 5130
},
{
"epoch": 0.07994270902255053,
"grad_norm": 4.039714336395264,
"learning_rate": 9.685026466298486e-07,
"loss": 0.8549,
"step": 5135
},
{
"epoch": 0.08002055002451991,
"grad_norm": 4.654749393463135,
"learning_rate": 9.684207076252437e-07,
"loss": 0.7294,
"step": 5140
},
{
"epoch": 0.08009839102648929,
"grad_norm": 5.652122497558594,
"learning_rate": 9.683387686206388e-07,
"loss": 0.8808,
"step": 5145
},
{
"epoch": 0.08017623202845867,
"grad_norm": 5.13718318939209,
"learning_rate": 9.682568296160338e-07,
"loss": 0.8012,
"step": 5150
},
{
"epoch": 0.08025407303042804,
"grad_norm": 4.274785995483398,
"learning_rate": 9.681748906114287e-07,
"loss": 0.9497,
"step": 5155
},
{
"epoch": 0.08033191403239742,
"grad_norm": 3.5715765953063965,
"learning_rate": 9.680929516068238e-07,
"loss": 0.6932,
"step": 5160
},
{
"epoch": 0.0804097550343668,
"grad_norm": 3.721369504928589,
"learning_rate": 9.680110126022188e-07,
"loss": 0.7604,
"step": 5165
},
{
"epoch": 0.08048759603633618,
"grad_norm": 4.815948486328125,
"learning_rate": 9.67929073597614e-07,
"loss": 0.8056,
"step": 5170
},
{
"epoch": 0.08056543703830556,
"grad_norm": 3.9973649978637695,
"learning_rate": 9.67847134593009e-07,
"loss": 0.8245,
"step": 5175
},
{
"epoch": 0.08064327804027493,
"grad_norm": 6.30864143371582,
"learning_rate": 9.67765195588404e-07,
"loss": 0.8158,
"step": 5180
},
{
"epoch": 0.08072111904224431,
"grad_norm": 3.627049207687378,
"learning_rate": 9.67683256583799e-07,
"loss": 0.6924,
"step": 5185
},
{
"epoch": 0.08079896004421369,
"grad_norm": 3.445680618286133,
"learning_rate": 9.67601317579194e-07,
"loss": 0.7619,
"step": 5190
},
{
"epoch": 0.08087680104618307,
"grad_norm": 5.6612868309021,
"learning_rate": 9.67519378574589e-07,
"loss": 0.8712,
"step": 5195
},
{
"epoch": 0.08095464204815245,
"grad_norm": 8.172099113464355,
"learning_rate": 9.67437439569984e-07,
"loss": 0.875,
"step": 5200
},
{
"epoch": 0.08103248305012183,
"grad_norm": 3.6549482345581055,
"learning_rate": 9.67355500565379e-07,
"loss": 0.7416,
"step": 5205
},
{
"epoch": 0.0811103240520912,
"grad_norm": 4.237252712249756,
"learning_rate": 9.67273561560774e-07,
"loss": 0.7864,
"step": 5210
},
{
"epoch": 0.08118816505406058,
"grad_norm": 3.6416895389556885,
"learning_rate": 9.671916225561692e-07,
"loss": 0.8346,
"step": 5215
},
{
"epoch": 0.08126600605602995,
"grad_norm": 7.055088996887207,
"learning_rate": 9.671096835515642e-07,
"loss": 0.8257,
"step": 5220
},
{
"epoch": 0.08134384705799932,
"grad_norm": 4.3031511306762695,
"learning_rate": 9.670277445469593e-07,
"loss": 0.9243,
"step": 5225
},
{
"epoch": 0.0814216880599687,
"grad_norm": 12.051529884338379,
"learning_rate": 9.669458055423542e-07,
"loss": 0.8012,
"step": 5230
},
{
"epoch": 0.08149952906193808,
"grad_norm": 3.5274226665496826,
"learning_rate": 9.668638665377493e-07,
"loss": 0.8752,
"step": 5235
},
{
"epoch": 0.08157737006390746,
"grad_norm": 3.1642568111419678,
"learning_rate": 9.667819275331443e-07,
"loss": 0.7385,
"step": 5240
},
{
"epoch": 0.08165521106587684,
"grad_norm": 3.645951271057129,
"learning_rate": 9.666999885285392e-07,
"loss": 0.7538,
"step": 5245
},
{
"epoch": 0.08173305206784622,
"grad_norm": 5.045301914215088,
"learning_rate": 9.666180495239343e-07,
"loss": 0.8496,
"step": 5250
},
{
"epoch": 0.0818108930698156,
"grad_norm": 3.8335864543914795,
"learning_rate": 9.665361105193293e-07,
"loss": 0.8149,
"step": 5255
},
{
"epoch": 0.08188873407178497,
"grad_norm": 5.525310516357422,
"learning_rate": 9.664541715147244e-07,
"loss": 0.8061,
"step": 5260
},
{
"epoch": 0.08196657507375435,
"grad_norm": 3.721007823944092,
"learning_rate": 9.663722325101195e-07,
"loss": 0.7835,
"step": 5265
},
{
"epoch": 0.08204441607572373,
"grad_norm": 4.0820393562316895,
"learning_rate": 9.662902935055146e-07,
"loss": 0.8629,
"step": 5270
},
{
"epoch": 0.0821222570776931,
"grad_norm": 2.5007712841033936,
"learning_rate": 9.662083545009094e-07,
"loss": 0.8987,
"step": 5275
},
{
"epoch": 0.08220009807966248,
"grad_norm": 5.49976110458374,
"learning_rate": 9.661264154963045e-07,
"loss": 0.8664,
"step": 5280
},
{
"epoch": 0.08227793908163186,
"grad_norm": 3.953249931335449,
"learning_rate": 9.660444764916996e-07,
"loss": 0.8364,
"step": 5285
},
{
"epoch": 0.08235578008360124,
"grad_norm": 5.422050476074219,
"learning_rate": 9.659625374870947e-07,
"loss": 0.825,
"step": 5290
},
{
"epoch": 0.08243362108557062,
"grad_norm": 6.019737720489502,
"learning_rate": 9.658805984824895e-07,
"loss": 0.7429,
"step": 5295
},
{
"epoch": 0.08251146208754,
"grad_norm": 4.360890865325928,
"learning_rate": 9.657986594778846e-07,
"loss": 0.8384,
"step": 5300
},
{
"epoch": 0.08258930308950937,
"grad_norm": 2.676135540008545,
"learning_rate": 9.657167204732797e-07,
"loss": 0.7928,
"step": 5305
},
{
"epoch": 0.08266714409147874,
"grad_norm": 2.602173328399658,
"learning_rate": 9.656347814686747e-07,
"loss": 0.8301,
"step": 5310
},
{
"epoch": 0.08274498509344812,
"grad_norm": 3.2521512508392334,
"learning_rate": 9.655528424640696e-07,
"loss": 0.8491,
"step": 5315
},
{
"epoch": 0.0828228260954175,
"grad_norm": 3.9603660106658936,
"learning_rate": 9.654709034594647e-07,
"loss": 0.9349,
"step": 5320
},
{
"epoch": 0.08290066709738687,
"grad_norm": 3.615999698638916,
"learning_rate": 9.653889644548598e-07,
"loss": 0.7493,
"step": 5325
},
{
"epoch": 0.08297850809935625,
"grad_norm": 4.19753360748291,
"learning_rate": 9.653070254502548e-07,
"loss": 0.8548,
"step": 5330
},
{
"epoch": 0.08305634910132563,
"grad_norm": 3.5472726821899414,
"learning_rate": 9.6522508644565e-07,
"loss": 0.6982,
"step": 5335
},
{
"epoch": 0.08313419010329501,
"grad_norm": 8.160552024841309,
"learning_rate": 9.651431474410448e-07,
"loss": 0.801,
"step": 5340
},
{
"epoch": 0.08321203110526439,
"grad_norm": 5.538876056671143,
"learning_rate": 9.650612084364398e-07,
"loss": 0.7735,
"step": 5345
},
{
"epoch": 0.08328987210723376,
"grad_norm": 5.047536849975586,
"learning_rate": 9.64979269431835e-07,
"loss": 0.9173,
"step": 5350
},
{
"epoch": 0.08336771310920314,
"grad_norm": 3.526073932647705,
"learning_rate": 9.648973304272298e-07,
"loss": 0.9703,
"step": 5355
},
{
"epoch": 0.08344555411117252,
"grad_norm": 12.305222511291504,
"learning_rate": 9.648153914226249e-07,
"loss": 0.8025,
"step": 5360
},
{
"epoch": 0.0835233951131419,
"grad_norm": 4.003148078918457,
"learning_rate": 9.6473345241802e-07,
"loss": 0.8206,
"step": 5365
},
{
"epoch": 0.08360123611511128,
"grad_norm": 3.3531124591827393,
"learning_rate": 9.64651513413415e-07,
"loss": 0.7922,
"step": 5370
},
{
"epoch": 0.08367907711708066,
"grad_norm": 3.3483853340148926,
"learning_rate": 9.6456957440881e-07,
"loss": 0.805,
"step": 5375
},
{
"epoch": 0.08375691811905003,
"grad_norm": 3.580211639404297,
"learning_rate": 9.644876354042052e-07,
"loss": 0.8038,
"step": 5380
},
{
"epoch": 0.08383475912101941,
"grad_norm": 4.441928863525391,
"learning_rate": 9.644056963996e-07,
"loss": 0.8188,
"step": 5385
},
{
"epoch": 0.08391260012298879,
"grad_norm": 4.342660903930664,
"learning_rate": 9.64323757394995e-07,
"loss": 0.9376,
"step": 5390
},
{
"epoch": 0.08399044112495815,
"grad_norm": 3.5513997077941895,
"learning_rate": 9.642418183903902e-07,
"loss": 0.8407,
"step": 5395
},
{
"epoch": 0.08406828212692753,
"grad_norm": 3.7131507396698,
"learning_rate": 9.64159879385785e-07,
"loss": 0.8832,
"step": 5400
},
{
"epoch": 0.08414612312889691,
"grad_norm": 4.675576686859131,
"learning_rate": 9.6407794038118e-07,
"loss": 0.8137,
"step": 5405
},
{
"epoch": 0.08422396413086629,
"grad_norm": 5.775442600250244,
"learning_rate": 9.639960013765752e-07,
"loss": 0.8705,
"step": 5410
},
{
"epoch": 0.08430180513283567,
"grad_norm": 3.2232508659362793,
"learning_rate": 9.639140623719703e-07,
"loss": 0.8266,
"step": 5415
},
{
"epoch": 0.08437964613480504,
"grad_norm": 3.5636298656463623,
"learning_rate": 9.638321233673653e-07,
"loss": 0.739,
"step": 5420
},
{
"epoch": 0.08445748713677442,
"grad_norm": 6.0133442878723145,
"learning_rate": 9.637501843627604e-07,
"loss": 0.7714,
"step": 5425
},
{
"epoch": 0.0845353281387438,
"grad_norm": 3.2928476333618164,
"learning_rate": 9.636682453581555e-07,
"loss": 0.7455,
"step": 5430
},
{
"epoch": 0.08461316914071318,
"grad_norm": 3.734174966812134,
"learning_rate": 9.635863063535503e-07,
"loss": 0.8668,
"step": 5435
},
{
"epoch": 0.08469101014268256,
"grad_norm": 3.125318765640259,
"learning_rate": 9.635043673489454e-07,
"loss": 0.8308,
"step": 5440
},
{
"epoch": 0.08476885114465194,
"grad_norm": 4.821923732757568,
"learning_rate": 9.634224283443405e-07,
"loss": 0.7993,
"step": 5445
},
{
"epoch": 0.08484669214662131,
"grad_norm": 3.52372407913208,
"learning_rate": 9.633404893397354e-07,
"loss": 0.9816,
"step": 5450
},
{
"epoch": 0.08492453314859069,
"grad_norm": 4.727661609649658,
"learning_rate": 9.632585503351304e-07,
"loss": 0.8819,
"step": 5455
},
{
"epoch": 0.08500237415056007,
"grad_norm": 6.401661396026611,
"learning_rate": 9.631766113305255e-07,
"loss": 0.879,
"step": 5460
},
{
"epoch": 0.08508021515252945,
"grad_norm": 3.203312873840332,
"learning_rate": 9.630946723259206e-07,
"loss": 0.7982,
"step": 5465
},
{
"epoch": 0.08515805615449883,
"grad_norm": 4.19862174987793,
"learning_rate": 9.630127333213157e-07,
"loss": 0.8441,
"step": 5470
},
{
"epoch": 0.0852358971564682,
"grad_norm": 3.9910812377929688,
"learning_rate": 9.629307943167107e-07,
"loss": 0.9595,
"step": 5475
},
{
"epoch": 0.08531373815843757,
"grad_norm": 3.739917755126953,
"learning_rate": 9.628488553121056e-07,
"loss": 0.9027,
"step": 5480
},
{
"epoch": 0.08539157916040695,
"grad_norm": 3.8963537216186523,
"learning_rate": 9.627669163075007e-07,
"loss": 0.7635,
"step": 5485
},
{
"epoch": 0.08546942016237633,
"grad_norm": 11.066873550415039,
"learning_rate": 9.626849773028957e-07,
"loss": 0.828,
"step": 5490
},
{
"epoch": 0.0855472611643457,
"grad_norm": 5.069997310638428,
"learning_rate": 9.626030382982906e-07,
"loss": 0.892,
"step": 5495
},
{
"epoch": 0.08562510216631508,
"grad_norm": 2.1011128425598145,
"learning_rate": 9.625210992936857e-07,
"loss": 0.7157,
"step": 5500
},
{
"epoch": 0.08570294316828446,
"grad_norm": 5.490849494934082,
"learning_rate": 9.624391602890808e-07,
"loss": 0.7912,
"step": 5505
},
{
"epoch": 0.08578078417025384,
"grad_norm": 5.189328670501709,
"learning_rate": 9.623572212844758e-07,
"loss": 0.6751,
"step": 5510
},
{
"epoch": 0.08585862517222322,
"grad_norm": 3.257615089416504,
"learning_rate": 9.62275282279871e-07,
"loss": 0.7616,
"step": 5515
},
{
"epoch": 0.0859364661741926,
"grad_norm": 3.8244619369506836,
"learning_rate": 9.621933432752658e-07,
"loss": 0.7356,
"step": 5520
},
{
"epoch": 0.08601430717616197,
"grad_norm": 4.616507530212402,
"learning_rate": 9.621114042706608e-07,
"loss": 0.8844,
"step": 5525
},
{
"epoch": 0.08609214817813135,
"grad_norm": 8.950932502746582,
"learning_rate": 9.62029465266056e-07,
"loss": 0.8493,
"step": 5530
},
{
"epoch": 0.08616998918010073,
"grad_norm": 3.257582187652588,
"learning_rate": 9.61947526261451e-07,
"loss": 0.7523,
"step": 5535
},
{
"epoch": 0.0862478301820701,
"grad_norm": 9.792999267578125,
"learning_rate": 9.61865587256846e-07,
"loss": 0.8037,
"step": 5540
},
{
"epoch": 0.08632567118403948,
"grad_norm": 3.294633626937866,
"learning_rate": 9.61783648252241e-07,
"loss": 0.7864,
"step": 5545
},
{
"epoch": 0.08640351218600886,
"grad_norm": 3.527974843978882,
"learning_rate": 9.61701709247636e-07,
"loss": 0.7226,
"step": 5550
},
{
"epoch": 0.08648135318797824,
"grad_norm": 10.905069351196289,
"learning_rate": 9.61619770243031e-07,
"loss": 0.8916,
"step": 5555
},
{
"epoch": 0.08655919418994762,
"grad_norm": 5.191342830657959,
"learning_rate": 9.61537831238426e-07,
"loss": 0.7855,
"step": 5560
},
{
"epoch": 0.08663703519191698,
"grad_norm": 4.45928430557251,
"learning_rate": 9.61455892233821e-07,
"loss": 0.6989,
"step": 5565
},
{
"epoch": 0.08671487619388636,
"grad_norm": 3.573596954345703,
"learning_rate": 9.61373953229216e-07,
"loss": 0.718,
"step": 5570
},
{
"epoch": 0.08679271719585574,
"grad_norm": 3.490968942642212,
"learning_rate": 9.612920142246112e-07,
"loss": 0.9475,
"step": 5575
},
{
"epoch": 0.08687055819782512,
"grad_norm": 5.315331935882568,
"learning_rate": 9.612100752200062e-07,
"loss": 0.7643,
"step": 5580
},
{
"epoch": 0.0869483991997945,
"grad_norm": 5.576305389404297,
"learning_rate": 9.611281362154013e-07,
"loss": 0.8273,
"step": 5585
},
{
"epoch": 0.08702624020176387,
"grad_norm": 3.3249528408050537,
"learning_rate": 9.610461972107962e-07,
"loss": 0.7327,
"step": 5590
},
{
"epoch": 0.08710408120373325,
"grad_norm": 5.021561622619629,
"learning_rate": 9.609642582061913e-07,
"loss": 0.7693,
"step": 5595
},
{
"epoch": 0.08718192220570263,
"grad_norm": 4.7560834884643555,
"learning_rate": 9.608823192015863e-07,
"loss": 0.7056,
"step": 5600
},
{
"epoch": 0.08725976320767201,
"grad_norm": 4.182785987854004,
"learning_rate": 9.608003801969812e-07,
"loss": 0.7807,
"step": 5605
},
{
"epoch": 0.08733760420964139,
"grad_norm": 3.104510545730591,
"learning_rate": 9.607184411923763e-07,
"loss": 0.771,
"step": 5610
},
{
"epoch": 0.08741544521161076,
"grad_norm": 4.316323280334473,
"learning_rate": 9.606365021877713e-07,
"loss": 0.8881,
"step": 5615
},
{
"epoch": 0.08749328621358014,
"grad_norm": 4.004445552825928,
"learning_rate": 9.605545631831664e-07,
"loss": 0.8125,
"step": 5620
},
{
"epoch": 0.08757112721554952,
"grad_norm": 5.998608112335205,
"learning_rate": 9.604726241785615e-07,
"loss": 0.8896,
"step": 5625
},
{
"epoch": 0.0876489682175189,
"grad_norm": 6.251708507537842,
"learning_rate": 9.603906851739566e-07,
"loss": 0.97,
"step": 5630
},
{
"epoch": 0.08772680921948828,
"grad_norm": 4.202377796173096,
"learning_rate": 9.603087461693514e-07,
"loss": 0.9106,
"step": 5635
},
{
"epoch": 0.08780465022145766,
"grad_norm": 7.748138427734375,
"learning_rate": 9.602268071647465e-07,
"loss": 0.8634,
"step": 5640
},
{
"epoch": 0.08788249122342703,
"grad_norm": 5.498707294464111,
"learning_rate": 9.601448681601416e-07,
"loss": 0.7508,
"step": 5645
},
{
"epoch": 0.08796033222539641,
"grad_norm": 3.504171133041382,
"learning_rate": 9.600629291555364e-07,
"loss": 0.8638,
"step": 5650
},
{
"epoch": 0.08803817322736578,
"grad_norm": 4.243772983551025,
"learning_rate": 9.599809901509315e-07,
"loss": 0.8651,
"step": 5655
},
{
"epoch": 0.08811601422933515,
"grad_norm": 2.332878589630127,
"learning_rate": 9.598990511463266e-07,
"loss": 0.6884,
"step": 5660
},
{
"epoch": 0.08819385523130453,
"grad_norm": 5.470850944519043,
"learning_rate": 9.598171121417217e-07,
"loss": 0.9159,
"step": 5665
},
{
"epoch": 0.08827169623327391,
"grad_norm": 3.167588710784912,
"learning_rate": 9.597351731371167e-07,
"loss": 0.7925,
"step": 5670
},
{
"epoch": 0.08834953723524329,
"grad_norm": 8.463876724243164,
"learning_rate": 9.596532341325118e-07,
"loss": 0.773,
"step": 5675
},
{
"epoch": 0.08842737823721267,
"grad_norm": 5.318755626678467,
"learning_rate": 9.595712951279067e-07,
"loss": 0.8523,
"step": 5680
},
{
"epoch": 0.08850521923918205,
"grad_norm": 8.276546478271484,
"learning_rate": 9.594893561233018e-07,
"loss": 0.7573,
"step": 5685
},
{
"epoch": 0.08858306024115142,
"grad_norm": 3.6410884857177734,
"learning_rate": 9.594074171186968e-07,
"loss": 1.0501,
"step": 5690
},
{
"epoch": 0.0886609012431208,
"grad_norm": 4.722231388092041,
"learning_rate": 9.59325478114092e-07,
"loss": 0.8005,
"step": 5695
},
{
"epoch": 0.08873874224509018,
"grad_norm": 4.808355808258057,
"learning_rate": 9.592435391094868e-07,
"loss": 0.8858,
"step": 5700
},
{
"epoch": 0.08881658324705956,
"grad_norm": 3.3222663402557373,
"learning_rate": 9.591616001048818e-07,
"loss": 0.8279,
"step": 5705
},
{
"epoch": 0.08889442424902894,
"grad_norm": 6.019637584686279,
"learning_rate": 9.59079661100277e-07,
"loss": 0.8425,
"step": 5710
},
{
"epoch": 0.08897226525099831,
"grad_norm": 3.4430840015411377,
"learning_rate": 9.58997722095672e-07,
"loss": 0.8277,
"step": 5715
},
{
"epoch": 0.08905010625296769,
"grad_norm": 2.4599595069885254,
"learning_rate": 9.58915783091067e-07,
"loss": 0.8047,
"step": 5720
},
{
"epoch": 0.08912794725493707,
"grad_norm": 5.123390197753906,
"learning_rate": 9.58833844086462e-07,
"loss": 0.7134,
"step": 5725
},
{
"epoch": 0.08920578825690645,
"grad_norm": 5.264007568359375,
"learning_rate": 9.58751905081857e-07,
"loss": 0.914,
"step": 5730
},
{
"epoch": 0.08928362925887583,
"grad_norm": 2.6615512371063232,
"learning_rate": 9.58669966077252e-07,
"loss": 0.8613,
"step": 5735
},
{
"epoch": 0.08936147026084519,
"grad_norm": 2.7306411266326904,
"learning_rate": 9.585880270726472e-07,
"loss": 0.6556,
"step": 5740
},
{
"epoch": 0.08943931126281457,
"grad_norm": 3.124546766281128,
"learning_rate": 9.58506088068042e-07,
"loss": 0.7402,
"step": 5745
},
{
"epoch": 0.08951715226478395,
"grad_norm": 3.2253921031951904,
"learning_rate": 9.58424149063437e-07,
"loss": 0.8324,
"step": 5750
},
{
"epoch": 0.08959499326675333,
"grad_norm": 4.765871524810791,
"learning_rate": 9.583422100588322e-07,
"loss": 0.8187,
"step": 5755
},
{
"epoch": 0.0896728342687227,
"grad_norm": 5.348093509674072,
"learning_rate": 9.582602710542272e-07,
"loss": 0.8607,
"step": 5760
},
{
"epoch": 0.08975067527069208,
"grad_norm": 3.306044340133667,
"learning_rate": 9.581783320496221e-07,
"loss": 0.8503,
"step": 5765
},
{
"epoch": 0.08982851627266146,
"grad_norm": 5.045707702636719,
"learning_rate": 9.580963930450172e-07,
"loss": 0.8423,
"step": 5770
},
{
"epoch": 0.08990635727463084,
"grad_norm": 6.576409816741943,
"learning_rate": 9.580144540404123e-07,
"loss": 0.9193,
"step": 5775
},
{
"epoch": 0.08998419827660022,
"grad_norm": 7.775379180908203,
"learning_rate": 9.579325150358073e-07,
"loss": 0.8257,
"step": 5780
},
{
"epoch": 0.0900620392785696,
"grad_norm": 8.183690071105957,
"learning_rate": 9.578505760312024e-07,
"loss": 0.7978,
"step": 5785
},
{
"epoch": 0.09013988028053897,
"grad_norm": 5.8509087562561035,
"learning_rate": 9.577686370265975e-07,
"loss": 0.7486,
"step": 5790
},
{
"epoch": 0.09021772128250835,
"grad_norm": 7.354578971862793,
"learning_rate": 9.576866980219923e-07,
"loss": 0.8432,
"step": 5795
},
{
"epoch": 0.09029556228447773,
"grad_norm": 3.6449766159057617,
"learning_rate": 9.576047590173874e-07,
"loss": 0.8296,
"step": 5800
},
{
"epoch": 0.09037340328644711,
"grad_norm": 9.557231903076172,
"learning_rate": 9.575228200127825e-07,
"loss": 0.7974,
"step": 5805
},
{
"epoch": 0.09045124428841649,
"grad_norm": 4.868302345275879,
"learning_rate": 9.574408810081774e-07,
"loss": 0.9218,
"step": 5810
},
{
"epoch": 0.09052908529038586,
"grad_norm": 4.260608196258545,
"learning_rate": 9.573589420035724e-07,
"loss": 0.7428,
"step": 5815
},
{
"epoch": 0.09060692629235524,
"grad_norm": 3.023204803466797,
"learning_rate": 9.572770029989675e-07,
"loss": 0.8903,
"step": 5820
},
{
"epoch": 0.0906847672943246,
"grad_norm": 3.036348819732666,
"learning_rate": 9.571950639943626e-07,
"loss": 0.89,
"step": 5825
},
{
"epoch": 0.09076260829629398,
"grad_norm": 4.273719310760498,
"learning_rate": 9.571131249897577e-07,
"loss": 0.8611,
"step": 5830
},
{
"epoch": 0.09084044929826336,
"grad_norm": 3.51576828956604,
"learning_rate": 9.570311859851527e-07,
"loss": 0.829,
"step": 5835
},
{
"epoch": 0.09091829030023274,
"grad_norm": 3.904651641845703,
"learning_rate": 9.569492469805476e-07,
"loss": 0.9588,
"step": 5840
},
{
"epoch": 0.09099613130220212,
"grad_norm": 3.824842691421509,
"learning_rate": 9.568673079759427e-07,
"loss": 0.7191,
"step": 5845
},
{
"epoch": 0.0910739723041715,
"grad_norm": 5.210089683532715,
"learning_rate": 9.567853689713377e-07,
"loss": 0.8695,
"step": 5850
},
{
"epoch": 0.09115181330614087,
"grad_norm": 2.847330093383789,
"learning_rate": 9.567034299667326e-07,
"loss": 0.8151,
"step": 5855
},
{
"epoch": 0.09122965430811025,
"grad_norm": 4.297481060028076,
"learning_rate": 9.566214909621277e-07,
"loss": 0.91,
"step": 5860
},
{
"epoch": 0.09130749531007963,
"grad_norm": 5.124939918518066,
"learning_rate": 9.565395519575228e-07,
"loss": 0.7646,
"step": 5865
},
{
"epoch": 0.09138533631204901,
"grad_norm": 4.084904193878174,
"learning_rate": 9.564576129529178e-07,
"loss": 0.8639,
"step": 5870
},
{
"epoch": 0.09146317731401839,
"grad_norm": 7.2979254722595215,
"learning_rate": 9.56375673948313e-07,
"loss": 0.909,
"step": 5875
},
{
"epoch": 0.09154101831598777,
"grad_norm": 3.893127202987671,
"learning_rate": 9.56293734943708e-07,
"loss": 0.7701,
"step": 5880
},
{
"epoch": 0.09161885931795714,
"grad_norm": 3.9665653705596924,
"learning_rate": 9.562117959391028e-07,
"loss": 0.8257,
"step": 5885
},
{
"epoch": 0.09169670031992652,
"grad_norm": 3.298375129699707,
"learning_rate": 9.56129856934498e-07,
"loss": 0.8005,
"step": 5890
},
{
"epoch": 0.0917745413218959,
"grad_norm": 3.643336057662964,
"learning_rate": 9.56047917929893e-07,
"loss": 0.8725,
"step": 5895
},
{
"epoch": 0.09185238232386528,
"grad_norm": 11.060576438903809,
"learning_rate": 9.559659789252879e-07,
"loss": 0.8681,
"step": 5900
},
{
"epoch": 0.09193022332583466,
"grad_norm": 4.433795928955078,
"learning_rate": 9.55884039920683e-07,
"loss": 0.8007,
"step": 5905
},
{
"epoch": 0.09200806432780402,
"grad_norm": 6.115171909332275,
"learning_rate": 9.55802100916078e-07,
"loss": 0.73,
"step": 5910
},
{
"epoch": 0.0920859053297734,
"grad_norm": 4.329387187957764,
"learning_rate": 9.55720161911473e-07,
"loss": 0.7698,
"step": 5915
},
{
"epoch": 0.09216374633174278,
"grad_norm": 4.206638813018799,
"learning_rate": 9.556382229068682e-07,
"loss": 0.7973,
"step": 5920
},
{
"epoch": 0.09224158733371216,
"grad_norm": 3.0813913345336914,
"learning_rate": 9.55556283902263e-07,
"loss": 0.7804,
"step": 5925
},
{
"epoch": 0.09231942833568153,
"grad_norm": 6.411551475524902,
"learning_rate": 9.55474344897658e-07,
"loss": 0.8881,
"step": 5930
},
{
"epoch": 0.09239726933765091,
"grad_norm": 2.5208792686462402,
"learning_rate": 9.553924058930532e-07,
"loss": 0.841,
"step": 5935
},
{
"epoch": 0.09247511033962029,
"grad_norm": 2.8447041511535645,
"learning_rate": 9.553104668884482e-07,
"loss": 0.791,
"step": 5940
},
{
"epoch": 0.09255295134158967,
"grad_norm": 4.374822616577148,
"learning_rate": 9.552285278838433e-07,
"loss": 0.8153,
"step": 5945
},
{
"epoch": 0.09263079234355905,
"grad_norm": 6.252150058746338,
"learning_rate": 9.551465888792382e-07,
"loss": 0.8223,
"step": 5950
},
{
"epoch": 0.09270863334552842,
"grad_norm": 3.3018994331359863,
"learning_rate": 9.550646498746333e-07,
"loss": 0.9067,
"step": 5955
},
{
"epoch": 0.0927864743474978,
"grad_norm": 4.026679515838623,
"learning_rate": 9.549827108700283e-07,
"loss": 0.7962,
"step": 5960
},
{
"epoch": 0.09286431534946718,
"grad_norm": 3.2476413249969482,
"learning_rate": 9.549007718654232e-07,
"loss": 0.8074,
"step": 5965
},
{
"epoch": 0.09294215635143656,
"grad_norm": 2.852954149246216,
"learning_rate": 9.548188328608183e-07,
"loss": 0.8071,
"step": 5970
},
{
"epoch": 0.09301999735340594,
"grad_norm": 3.4490416049957275,
"learning_rate": 9.547368938562133e-07,
"loss": 0.9519,
"step": 5975
},
{
"epoch": 0.09309783835537531,
"grad_norm": 2.473008155822754,
"learning_rate": 9.546549548516084e-07,
"loss": 0.8446,
"step": 5980
},
{
"epoch": 0.09317567935734469,
"grad_norm": 7.381313800811768,
"learning_rate": 9.545730158470035e-07,
"loss": 0.802,
"step": 5985
},
{
"epoch": 0.09325352035931407,
"grad_norm": 4.133596897125244,
"learning_rate": 9.544910768423986e-07,
"loss": 0.7167,
"step": 5990
},
{
"epoch": 0.09333136136128345,
"grad_norm": 4.466127872467041,
"learning_rate": 9.544091378377934e-07,
"loss": 0.9329,
"step": 5995
},
{
"epoch": 0.09340920236325281,
"grad_norm": 4.252684593200684,
"learning_rate": 9.543271988331885e-07,
"loss": 0.856,
"step": 6000
},
{
"epoch": 0.09348704336522219,
"grad_norm": 3.630127429962158,
"learning_rate": 9.542452598285836e-07,
"loss": 0.8738,
"step": 6005
},
{
"epoch": 0.09356488436719157,
"grad_norm": 8.133733749389648,
"learning_rate": 9.541633208239784e-07,
"loss": 0.8339,
"step": 6010
},
{
"epoch": 0.09364272536916095,
"grad_norm": 11.18271541595459,
"learning_rate": 9.540813818193735e-07,
"loss": 0.7978,
"step": 6015
},
{
"epoch": 0.09372056637113033,
"grad_norm": 6.3515214920043945,
"learning_rate": 9.539994428147686e-07,
"loss": 0.7742,
"step": 6020
},
{
"epoch": 0.0937984073730997,
"grad_norm": 3.030446767807007,
"learning_rate": 9.539175038101637e-07,
"loss": 0.6648,
"step": 6025
},
{
"epoch": 0.09387624837506908,
"grad_norm": 5.10403299331665,
"learning_rate": 9.538355648055587e-07,
"loss": 0.9393,
"step": 6030
},
{
"epoch": 0.09395408937703846,
"grad_norm": 2.5483992099761963,
"learning_rate": 9.537536258009538e-07,
"loss": 0.8155,
"step": 6035
},
{
"epoch": 0.09403193037900784,
"grad_norm": 6.4216814041137695,
"learning_rate": 9.536716867963488e-07,
"loss": 0.9805,
"step": 6040
},
{
"epoch": 0.09410977138097722,
"grad_norm": 8.188408851623535,
"learning_rate": 9.535897477917439e-07,
"loss": 0.8092,
"step": 6045
},
{
"epoch": 0.0941876123829466,
"grad_norm": 4.022781848907471,
"learning_rate": 9.535078087871387e-07,
"loss": 0.8009,
"step": 6050
},
{
"epoch": 0.09426545338491597,
"grad_norm": 3.303135871887207,
"learning_rate": 9.534258697825338e-07,
"loss": 0.7763,
"step": 6055
},
{
"epoch": 0.09434329438688535,
"grad_norm": 5.49419641494751,
"learning_rate": 9.533439307779289e-07,
"loss": 0.8246,
"step": 6060
},
{
"epoch": 0.09442113538885473,
"grad_norm": 4.208410739898682,
"learning_rate": 9.532619917733238e-07,
"loss": 0.8875,
"step": 6065
},
{
"epoch": 0.09449897639082411,
"grad_norm": 2.5194616317749023,
"learning_rate": 9.531800527687189e-07,
"loss": 0.8782,
"step": 6070
},
{
"epoch": 0.09457681739279349,
"grad_norm": 3.4753055572509766,
"learning_rate": 9.53098113764114e-07,
"loss": 0.7719,
"step": 6075
},
{
"epoch": 0.09465465839476286,
"grad_norm": 4.319244861602783,
"learning_rate": 9.53016174759509e-07,
"loss": 0.9141,
"step": 6080
},
{
"epoch": 0.09473249939673223,
"grad_norm": 2.9613096714019775,
"learning_rate": 9.52934235754904e-07,
"loss": 0.7196,
"step": 6085
},
{
"epoch": 0.0948103403987016,
"grad_norm": 6.506518840789795,
"learning_rate": 9.52852296750299e-07,
"loss": 0.8589,
"step": 6090
},
{
"epoch": 0.09488818140067098,
"grad_norm": 7.105751037597656,
"learning_rate": 9.527703577456941e-07,
"loss": 0.8392,
"step": 6095
},
{
"epoch": 0.09496602240264036,
"grad_norm": 3.0105667114257812,
"learning_rate": 9.52688418741089e-07,
"loss": 0.6962,
"step": 6100
},
{
"epoch": 0.09504386340460974,
"grad_norm": 7.148667812347412,
"learning_rate": 9.526064797364841e-07,
"loss": 0.7958,
"step": 6105
},
{
"epoch": 0.09512170440657912,
"grad_norm": 11.00757122039795,
"learning_rate": 9.525245407318792e-07,
"loss": 0.8516,
"step": 6110
},
{
"epoch": 0.0951995454085485,
"grad_norm": 5.520313739776611,
"learning_rate": 9.524426017272742e-07,
"loss": 0.9142,
"step": 6115
},
{
"epoch": 0.09527738641051788,
"grad_norm": 3.418109893798828,
"learning_rate": 9.523606627226692e-07,
"loss": 0.7326,
"step": 6120
},
{
"epoch": 0.09535522741248725,
"grad_norm": 3.9674932956695557,
"learning_rate": 9.522787237180643e-07,
"loss": 0.8561,
"step": 6125
},
{
"epoch": 0.09543306841445663,
"grad_norm": 3.5758800506591797,
"learning_rate": 9.521967847134592e-07,
"loss": 0.8119,
"step": 6130
},
{
"epoch": 0.09551090941642601,
"grad_norm": 4.39679479598999,
"learning_rate": 9.521148457088543e-07,
"loss": 0.8427,
"step": 6135
},
{
"epoch": 0.09558875041839539,
"grad_norm": 5.498544216156006,
"learning_rate": 9.520329067042493e-07,
"loss": 0.8786,
"step": 6140
},
{
"epoch": 0.09566659142036477,
"grad_norm": 3.5914194583892822,
"learning_rate": 9.519509676996443e-07,
"loss": 0.8465,
"step": 6145
},
{
"epoch": 0.09574443242233414,
"grad_norm": 5.639887809753418,
"learning_rate": 9.518690286950394e-07,
"loss": 0.764,
"step": 6150
},
{
"epoch": 0.09582227342430352,
"grad_norm": 7.638282299041748,
"learning_rate": 9.517870896904345e-07,
"loss": 0.858,
"step": 6155
},
{
"epoch": 0.0959001144262729,
"grad_norm": 3.7768096923828125,
"learning_rate": 9.517051506858294e-07,
"loss": 0.8739,
"step": 6160
},
{
"epoch": 0.09597795542824228,
"grad_norm": 3.388122081756592,
"learning_rate": 9.516232116812245e-07,
"loss": 0.7646,
"step": 6165
},
{
"epoch": 0.09605579643021164,
"grad_norm": 3.0460891723632812,
"learning_rate": 9.515412726766195e-07,
"loss": 0.9288,
"step": 6170
},
{
"epoch": 0.09613363743218102,
"grad_norm": 3.9041805267333984,
"learning_rate": 9.514593336720144e-07,
"loss": 0.735,
"step": 6175
},
{
"epoch": 0.0962114784341504,
"grad_norm": 3.894850254058838,
"learning_rate": 9.513773946674095e-07,
"loss": 0.7104,
"step": 6180
},
{
"epoch": 0.09628931943611978,
"grad_norm": 3.6872172355651855,
"learning_rate": 9.512954556628046e-07,
"loss": 0.7743,
"step": 6185
},
{
"epoch": 0.09636716043808916,
"grad_norm": 2.9574503898620605,
"learning_rate": 9.512135166581996e-07,
"loss": 0.8923,
"step": 6190
},
{
"epoch": 0.09644500144005853,
"grad_norm": 3.6874301433563232,
"learning_rate": 9.511315776535946e-07,
"loss": 0.7798,
"step": 6195
},
{
"epoch": 0.09652284244202791,
"grad_norm": 5.110114574432373,
"learning_rate": 9.510496386489897e-07,
"loss": 0.8202,
"step": 6200
},
{
"epoch": 0.09660068344399729,
"grad_norm": 4.243130683898926,
"learning_rate": 9.509676996443848e-07,
"loss": 0.9956,
"step": 6205
},
{
"epoch": 0.09667852444596667,
"grad_norm": 9.388118743896484,
"learning_rate": 9.508857606397796e-07,
"loss": 0.8103,
"step": 6210
},
{
"epoch": 0.09675636544793605,
"grad_norm": 4.399020671844482,
"learning_rate": 9.508038216351747e-07,
"loss": 0.9899,
"step": 6215
},
{
"epoch": 0.09683420644990542,
"grad_norm": 5.260294437408447,
"learning_rate": 9.507218826305698e-07,
"loss": 0.7776,
"step": 6220
},
{
"epoch": 0.0969120474518748,
"grad_norm": 2.903243064880371,
"learning_rate": 9.506399436259648e-07,
"loss": 0.8065,
"step": 6225
},
{
"epoch": 0.09698988845384418,
"grad_norm": 7.704418182373047,
"learning_rate": 9.505580046213598e-07,
"loss": 0.6625,
"step": 6230
},
{
"epoch": 0.09706772945581356,
"grad_norm": 3.7152814865112305,
"learning_rate": 9.504760656167549e-07,
"loss": 0.8702,
"step": 6235
},
{
"epoch": 0.09714557045778294,
"grad_norm": 6.636418342590332,
"learning_rate": 9.503941266121499e-07,
"loss": 0.7592,
"step": 6240
},
{
"epoch": 0.09722341145975232,
"grad_norm": 5.01901388168335,
"learning_rate": 9.50312187607545e-07,
"loss": 0.8377,
"step": 6245
},
{
"epoch": 0.0973012524617217,
"grad_norm": 6.149816989898682,
"learning_rate": 9.502302486029399e-07,
"loss": 0.7698,
"step": 6250
},
{
"epoch": 0.09737909346369107,
"grad_norm": 4.017423152923584,
"learning_rate": 9.501483095983349e-07,
"loss": 0.7522,
"step": 6255
},
{
"epoch": 0.09745693446566044,
"grad_norm": 3.395038366317749,
"learning_rate": 9.5006637059373e-07,
"loss": 0.7814,
"step": 6260
},
{
"epoch": 0.09753477546762981,
"grad_norm": 8.359529495239258,
"learning_rate": 9.49984431589125e-07,
"loss": 0.8817,
"step": 6265
},
{
"epoch": 0.09761261646959919,
"grad_norm": 5.801593780517578,
"learning_rate": 9.4990249258452e-07,
"loss": 0.8346,
"step": 6270
},
{
"epoch": 0.09769045747156857,
"grad_norm": 3.440136432647705,
"learning_rate": 9.498205535799151e-07,
"loss": 0.7793,
"step": 6275
},
{
"epoch": 0.09776829847353795,
"grad_norm": 3.3891918659210205,
"learning_rate": 9.497386145753102e-07,
"loss": 0.7846,
"step": 6280
},
{
"epoch": 0.09784613947550733,
"grad_norm": 3.6862120628356934,
"learning_rate": 9.496566755707051e-07,
"loss": 0.8316,
"step": 6285
},
{
"epoch": 0.0979239804774767,
"grad_norm": 4.2608642578125,
"learning_rate": 9.495747365661001e-07,
"loss": 0.8244,
"step": 6290
},
{
"epoch": 0.09800182147944608,
"grad_norm": 4.8404459953308105,
"learning_rate": 9.494927975614952e-07,
"loss": 0.7919,
"step": 6295
},
{
"epoch": 0.09807966248141546,
"grad_norm": 2.3203227519989014,
"learning_rate": 9.494108585568901e-07,
"loss": 0.7593,
"step": 6300
},
{
"epoch": 0.09815750348338484,
"grad_norm": 2.870492935180664,
"learning_rate": 9.493289195522852e-07,
"loss": 0.7388,
"step": 6305
},
{
"epoch": 0.09823534448535422,
"grad_norm": 3.6634552478790283,
"learning_rate": 9.492469805476803e-07,
"loss": 0.6632,
"step": 6310
},
{
"epoch": 0.0983131854873236,
"grad_norm": 4.569755554199219,
"learning_rate": 9.491650415430753e-07,
"loss": 0.866,
"step": 6315
},
{
"epoch": 0.09839102648929297,
"grad_norm": 3.319843053817749,
"learning_rate": 9.490831025384703e-07,
"loss": 0.8812,
"step": 6320
},
{
"epoch": 0.09846886749126235,
"grad_norm": 5.647189140319824,
"learning_rate": 9.490011635338654e-07,
"loss": 0.893,
"step": 6325
},
{
"epoch": 0.09854670849323173,
"grad_norm": 4.285895347595215,
"learning_rate": 9.489192245292603e-07,
"loss": 0.9623,
"step": 6330
},
{
"epoch": 0.09862454949520111,
"grad_norm": 4.257463455200195,
"learning_rate": 9.488372855246553e-07,
"loss": 0.7959,
"step": 6335
},
{
"epoch": 0.09870239049717049,
"grad_norm": 4.747158050537109,
"learning_rate": 9.487553465200504e-07,
"loss": 0.6869,
"step": 6340
},
{
"epoch": 0.09878023149913985,
"grad_norm": 4.191068172454834,
"learning_rate": 9.486734075154455e-07,
"loss": 0.8163,
"step": 6345
},
{
"epoch": 0.09885807250110923,
"grad_norm": 4.583565711975098,
"learning_rate": 9.485914685108405e-07,
"loss": 0.7348,
"step": 6350
},
{
"epoch": 0.0989359135030786,
"grad_norm": 3.9108059406280518,
"learning_rate": 9.485095295062355e-07,
"loss": 0.7968,
"step": 6355
},
{
"epoch": 0.09901375450504799,
"grad_norm": 5.722688674926758,
"learning_rate": 9.484275905016306e-07,
"loss": 0.8138,
"step": 6360
},
{
"epoch": 0.09909159550701736,
"grad_norm": 2.972755193710327,
"learning_rate": 9.483456514970256e-07,
"loss": 0.795,
"step": 6365
},
{
"epoch": 0.09916943650898674,
"grad_norm": 8.901226997375488,
"learning_rate": 9.482637124924207e-07,
"loss": 0.7491,
"step": 6370
},
{
"epoch": 0.09924727751095612,
"grad_norm": 7.961559772491455,
"learning_rate": 9.481817734878156e-07,
"loss": 1.0205,
"step": 6375
},
{
"epoch": 0.0993251185129255,
"grad_norm": 6.042298316955566,
"learning_rate": 9.480998344832106e-07,
"loss": 0.7984,
"step": 6380
},
{
"epoch": 0.09940295951489488,
"grad_norm": 10.556697845458984,
"learning_rate": 9.480178954786057e-07,
"loss": 0.8639,
"step": 6385
},
{
"epoch": 0.09948080051686425,
"grad_norm": 2.7401647567749023,
"learning_rate": 9.479359564740007e-07,
"loss": 0.799,
"step": 6390
},
{
"epoch": 0.09955864151883363,
"grad_norm": 6.288196086883545,
"learning_rate": 9.478540174693957e-07,
"loss": 0.9338,
"step": 6395
},
{
"epoch": 0.09963648252080301,
"grad_norm": 4.34282112121582,
"learning_rate": 9.477720784647908e-07,
"loss": 0.8355,
"step": 6400
},
{
"epoch": 0.09971432352277239,
"grad_norm": 3.5038483142852783,
"learning_rate": 9.476901394601859e-07,
"loss": 0.7796,
"step": 6405
},
{
"epoch": 0.09979216452474177,
"grad_norm": 4.715381622314453,
"learning_rate": 9.476082004555808e-07,
"loss": 0.815,
"step": 6410
},
{
"epoch": 0.09987000552671114,
"grad_norm": 3.7928483486175537,
"learning_rate": 9.475262614509758e-07,
"loss": 0.8346,
"step": 6415
},
{
"epoch": 0.09994784652868052,
"grad_norm": 5.622752666473389,
"learning_rate": 9.474443224463709e-07,
"loss": 0.8558,
"step": 6420
},
{
"epoch": 0.1000256875306499,
"grad_norm": 5.325289726257324,
"learning_rate": 9.473623834417658e-07,
"loss": 0.8078,
"step": 6425
},
{
"epoch": 0.10010352853261927,
"grad_norm": 3.5389554500579834,
"learning_rate": 9.472804444371609e-07,
"loss": 0.7461,
"step": 6430
},
{
"epoch": 0.10018136953458864,
"grad_norm": 8.74923038482666,
"learning_rate": 9.47198505432556e-07,
"loss": 0.7609,
"step": 6435
},
{
"epoch": 0.10025921053655802,
"grad_norm": 4.170187473297119,
"learning_rate": 9.47116566427951e-07,
"loss": 0.8243,
"step": 6440
},
{
"epoch": 0.1003370515385274,
"grad_norm": 12.243910789489746,
"learning_rate": 9.47034627423346e-07,
"loss": 0.7365,
"step": 6445
},
{
"epoch": 0.10041489254049678,
"grad_norm": 6.56355094909668,
"learning_rate": 9.469526884187411e-07,
"loss": 0.8693,
"step": 6450
},
{
"epoch": 0.10049273354246616,
"grad_norm": 4.917191982269287,
"learning_rate": 9.46870749414136e-07,
"loss": 0.8323,
"step": 6455
},
{
"epoch": 0.10057057454443553,
"grad_norm": 4.455476760864258,
"learning_rate": 9.467888104095311e-07,
"loss": 0.7539,
"step": 6460
},
{
"epoch": 0.10064841554640491,
"grad_norm": 4.135006904602051,
"learning_rate": 9.467068714049261e-07,
"loss": 0.772,
"step": 6465
},
{
"epoch": 0.10072625654837429,
"grad_norm": 5.814565658569336,
"learning_rate": 9.466249324003212e-07,
"loss": 0.8389,
"step": 6470
},
{
"epoch": 0.10080409755034367,
"grad_norm": 3.4807469844818115,
"learning_rate": 9.465429933957162e-07,
"loss": 0.8365,
"step": 6475
},
{
"epoch": 0.10088193855231305,
"grad_norm": 5.241673946380615,
"learning_rate": 9.464610543911112e-07,
"loss": 0.6689,
"step": 6480
},
{
"epoch": 0.10095977955428243,
"grad_norm": 6.0900678634643555,
"learning_rate": 9.463791153865063e-07,
"loss": 0.813,
"step": 6485
},
{
"epoch": 0.1010376205562518,
"grad_norm": 4.659064769744873,
"learning_rate": 9.462971763819013e-07,
"loss": 0.8624,
"step": 6490
},
{
"epoch": 0.10111546155822118,
"grad_norm": 7.9358320236206055,
"learning_rate": 9.462152373772963e-07,
"loss": 0.7209,
"step": 6495
},
{
"epoch": 0.10119330256019056,
"grad_norm": 3.9600491523742676,
"learning_rate": 9.461332983726913e-07,
"loss": 0.7995,
"step": 6500
},
{
"epoch": 0.10127114356215994,
"grad_norm": 4.832655906677246,
"learning_rate": 9.460513593680863e-07,
"loss": 0.8935,
"step": 6505
},
{
"epoch": 0.10134898456412932,
"grad_norm": 4.184332370758057,
"learning_rate": 9.459694203634814e-07,
"loss": 0.7468,
"step": 6510
},
{
"epoch": 0.10142682556609868,
"grad_norm": 3.2403645515441895,
"learning_rate": 9.458874813588765e-07,
"loss": 0.7502,
"step": 6515
},
{
"epoch": 0.10150466656806806,
"grad_norm": 6.45439338684082,
"learning_rate": 9.458055423542714e-07,
"loss": 0.8359,
"step": 6520
},
{
"epoch": 0.10158250757003744,
"grad_norm": 3.9225785732269287,
"learning_rate": 9.457236033496665e-07,
"loss": 0.822,
"step": 6525
},
{
"epoch": 0.10166034857200681,
"grad_norm": 6.211043834686279,
"learning_rate": 9.456416643450616e-07,
"loss": 0.7675,
"step": 6530
},
{
"epoch": 0.10173818957397619,
"grad_norm": 5.109851360321045,
"learning_rate": 9.455597253404564e-07,
"loss": 0.8539,
"step": 6535
},
{
"epoch": 0.10181603057594557,
"grad_norm": 3.1654608249664307,
"learning_rate": 9.454777863358515e-07,
"loss": 0.851,
"step": 6540
},
{
"epoch": 0.10189387157791495,
"grad_norm": 6.30355167388916,
"learning_rate": 9.453958473312466e-07,
"loss": 0.8668,
"step": 6545
},
{
"epoch": 0.10197171257988433,
"grad_norm": 2.9073293209075928,
"learning_rate": 9.453139083266416e-07,
"loss": 0.8354,
"step": 6550
},
{
"epoch": 0.1020495535818537,
"grad_norm": 4.239645481109619,
"learning_rate": 9.452319693220366e-07,
"loss": 0.8394,
"step": 6555
},
{
"epoch": 0.10212739458382308,
"grad_norm": 4.341432094573975,
"learning_rate": 9.451500303174317e-07,
"loss": 0.8248,
"step": 6560
},
{
"epoch": 0.10220523558579246,
"grad_norm": 5.958523273468018,
"learning_rate": 9.450680913128267e-07,
"loss": 0.7696,
"step": 6565
},
{
"epoch": 0.10228307658776184,
"grad_norm": 2.9546141624450684,
"learning_rate": 9.449861523082217e-07,
"loss": 0.8008,
"step": 6570
},
{
"epoch": 0.10236091758973122,
"grad_norm": 3.216296672821045,
"learning_rate": 9.449042133036167e-07,
"loss": 0.8286,
"step": 6575
},
{
"epoch": 0.1024387585917006,
"grad_norm": 5.784662246704102,
"learning_rate": 9.448222742990117e-07,
"loss": 0.8118,
"step": 6580
},
{
"epoch": 0.10251659959366997,
"grad_norm": 18.038049697875977,
"learning_rate": 9.447403352944068e-07,
"loss": 0.7844,
"step": 6585
},
{
"epoch": 0.10259444059563935,
"grad_norm": 3.434221029281616,
"learning_rate": 9.446583962898018e-07,
"loss": 0.7745,
"step": 6590
},
{
"epoch": 0.10267228159760873,
"grad_norm": 3.0332863330841064,
"learning_rate": 9.445764572851969e-07,
"loss": 0.9026,
"step": 6595
},
{
"epoch": 0.10275012259957811,
"grad_norm": 4.459526538848877,
"learning_rate": 9.444945182805919e-07,
"loss": 0.8231,
"step": 6600
},
{
"epoch": 0.10282796360154747,
"grad_norm": 11.914100646972656,
"learning_rate": 9.44412579275987e-07,
"loss": 0.7191,
"step": 6605
},
{
"epoch": 0.10290580460351685,
"grad_norm": 9.491118431091309,
"learning_rate": 9.44330640271382e-07,
"loss": 0.8438,
"step": 6610
},
{
"epoch": 0.10298364560548623,
"grad_norm": 3.312546968460083,
"learning_rate": 9.442487012667769e-07,
"loss": 0.7754,
"step": 6615
},
{
"epoch": 0.10306148660745561,
"grad_norm": 2.4198150634765625,
"learning_rate": 9.44166762262172e-07,
"loss": 0.7373,
"step": 6620
},
{
"epoch": 0.10313932760942499,
"grad_norm": 3.8953001499176025,
"learning_rate": 9.44084823257567e-07,
"loss": 0.6972,
"step": 6625
},
{
"epoch": 0.10321716861139436,
"grad_norm": 3.971245050430298,
"learning_rate": 9.44002884252962e-07,
"loss": 0.7959,
"step": 6630
},
{
"epoch": 0.10329500961336374,
"grad_norm": 3.9119505882263184,
"learning_rate": 9.439209452483571e-07,
"loss": 0.8072,
"step": 6635
},
{
"epoch": 0.10337285061533312,
"grad_norm": 3.6322784423828125,
"learning_rate": 9.438390062437522e-07,
"loss": 0.8389,
"step": 6640
},
{
"epoch": 0.1034506916173025,
"grad_norm": 3.221548557281494,
"learning_rate": 9.437570672391471e-07,
"loss": 0.8113,
"step": 6645
},
{
"epoch": 0.10352853261927188,
"grad_norm": 3.768453598022461,
"learning_rate": 9.436751282345422e-07,
"loss": 0.8859,
"step": 6650
},
{
"epoch": 0.10360637362124125,
"grad_norm": 3.436704635620117,
"learning_rate": 9.435931892299372e-07,
"loss": 0.9308,
"step": 6655
},
{
"epoch": 0.10368421462321063,
"grad_norm": 3.5479848384857178,
"learning_rate": 9.435112502253321e-07,
"loss": 0.8624,
"step": 6660
},
{
"epoch": 0.10376205562518001,
"grad_norm": 5.35614538192749,
"learning_rate": 9.434293112207272e-07,
"loss": 0.8095,
"step": 6665
},
{
"epoch": 0.10383989662714939,
"grad_norm": 3.0815038681030273,
"learning_rate": 9.433473722161223e-07,
"loss": 0.6846,
"step": 6670
},
{
"epoch": 0.10391773762911877,
"grad_norm": 5.047412872314453,
"learning_rate": 9.432654332115173e-07,
"loss": 0.7785,
"step": 6675
},
{
"epoch": 0.10399557863108815,
"grad_norm": 4.322173595428467,
"learning_rate": 9.431834942069123e-07,
"loss": 0.8362,
"step": 6680
},
{
"epoch": 0.10407341963305752,
"grad_norm": 4.15039587020874,
"learning_rate": 9.431015552023074e-07,
"loss": 0.7475,
"step": 6685
},
{
"epoch": 0.10415126063502689,
"grad_norm": 3.9758059978485107,
"learning_rate": 9.430196161977024e-07,
"loss": 0.8058,
"step": 6690
},
{
"epoch": 0.10422910163699627,
"grad_norm": 3.778308629989624,
"learning_rate": 9.429376771930975e-07,
"loss": 0.9209,
"step": 6695
},
{
"epoch": 0.10430694263896564,
"grad_norm": 7.6523566246032715,
"learning_rate": 9.428557381884924e-07,
"loss": 0.765,
"step": 6700
},
{
"epoch": 0.10438478364093502,
"grad_norm": 4.295438289642334,
"learning_rate": 9.427737991838874e-07,
"loss": 0.8556,
"step": 6705
},
{
"epoch": 0.1044626246429044,
"grad_norm": 9.01634407043457,
"learning_rate": 9.426918601792825e-07,
"loss": 0.7078,
"step": 6710
},
{
"epoch": 0.10454046564487378,
"grad_norm": 4.1538987159729,
"learning_rate": 9.426099211746775e-07,
"loss": 1.0486,
"step": 6715
},
{
"epoch": 0.10461830664684316,
"grad_norm": 5.460824489593506,
"learning_rate": 9.425279821700726e-07,
"loss": 0.7296,
"step": 6720
},
{
"epoch": 0.10469614764881253,
"grad_norm": 8.995347023010254,
"learning_rate": 9.424460431654676e-07,
"loss": 0.7228,
"step": 6725
},
{
"epoch": 0.10477398865078191,
"grad_norm": 3.254420042037964,
"learning_rate": 9.423641041608627e-07,
"loss": 0.8683,
"step": 6730
},
{
"epoch": 0.10485182965275129,
"grad_norm": 3.987894058227539,
"learning_rate": 9.422821651562577e-07,
"loss": 0.7663,
"step": 6735
},
{
"epoch": 0.10492967065472067,
"grad_norm": 3.244363307952881,
"learning_rate": 9.422002261516526e-07,
"loss": 0.8443,
"step": 6740
},
{
"epoch": 0.10500751165669005,
"grad_norm": 4.715000152587891,
"learning_rate": 9.421182871470477e-07,
"loss": 0.8059,
"step": 6745
},
{
"epoch": 0.10508535265865943,
"grad_norm": 5.014405727386475,
"learning_rate": 9.420363481424427e-07,
"loss": 0.8381,
"step": 6750
},
{
"epoch": 0.1051631936606288,
"grad_norm": 4.085587978363037,
"learning_rate": 9.419544091378377e-07,
"loss": 0.8104,
"step": 6755
},
{
"epoch": 0.10524103466259818,
"grad_norm": 12.998879432678223,
"learning_rate": 9.418724701332328e-07,
"loss": 0.8814,
"step": 6760
},
{
"epoch": 0.10531887566456756,
"grad_norm": 5.057702541351318,
"learning_rate": 9.417905311286279e-07,
"loss": 0.7355,
"step": 6765
},
{
"epoch": 0.10539671666653694,
"grad_norm": 4.116156578063965,
"learning_rate": 9.417085921240228e-07,
"loss": 0.907,
"step": 6770
},
{
"epoch": 0.1054745576685063,
"grad_norm": 3.0526468753814697,
"learning_rate": 9.416266531194179e-07,
"loss": 0.832,
"step": 6775
},
{
"epoch": 0.10555239867047568,
"grad_norm": 5.218168258666992,
"learning_rate": 9.415447141148129e-07,
"loss": 0.7835,
"step": 6780
},
{
"epoch": 0.10563023967244506,
"grad_norm": 7.534468650817871,
"learning_rate": 9.414627751102078e-07,
"loss": 0.8966,
"step": 6785
},
{
"epoch": 0.10570808067441444,
"grad_norm": 14.327566146850586,
"learning_rate": 9.413808361056029e-07,
"loss": 0.789,
"step": 6790
},
{
"epoch": 0.10578592167638382,
"grad_norm": 4.58953332901001,
"learning_rate": 9.41298897100998e-07,
"loss": 0.8134,
"step": 6795
},
{
"epoch": 0.1058637626783532,
"grad_norm": 3.4943652153015137,
"learning_rate": 9.41216958096393e-07,
"loss": 0.9615,
"step": 6800
},
{
"epoch": 0.10594160368032257,
"grad_norm": 5.815013408660889,
"learning_rate": 9.41135019091788e-07,
"loss": 0.7671,
"step": 6805
},
{
"epoch": 0.10601944468229195,
"grad_norm": 4.9490580558776855,
"learning_rate": 9.410530800871831e-07,
"loss": 0.7993,
"step": 6810
},
{
"epoch": 0.10609728568426133,
"grad_norm": 3.030304193496704,
"learning_rate": 9.409711410825781e-07,
"loss": 0.7573,
"step": 6815
},
{
"epoch": 0.1061751266862307,
"grad_norm": 3.3977646827697754,
"learning_rate": 9.408892020779731e-07,
"loss": 0.7966,
"step": 6820
},
{
"epoch": 0.10625296768820008,
"grad_norm": 9.117260932922363,
"learning_rate": 9.408072630733681e-07,
"loss": 0.8123,
"step": 6825
},
{
"epoch": 0.10633080869016946,
"grad_norm": 3.8861453533172607,
"learning_rate": 9.407253240687631e-07,
"loss": 0.8245,
"step": 6830
},
{
"epoch": 0.10640864969213884,
"grad_norm": 3.4242775440216064,
"learning_rate": 9.406433850641582e-07,
"loss": 0.7318,
"step": 6835
},
{
"epoch": 0.10648649069410822,
"grad_norm": 4.729854106903076,
"learning_rate": 9.405614460595532e-07,
"loss": 0.7959,
"step": 6840
},
{
"epoch": 0.1065643316960776,
"grad_norm": 3.1164026260375977,
"learning_rate": 9.404795070549483e-07,
"loss": 0.8327,
"step": 6845
},
{
"epoch": 0.10664217269804697,
"grad_norm": 4.031877040863037,
"learning_rate": 9.403975680503433e-07,
"loss": 0.807,
"step": 6850
},
{
"epoch": 0.10672001370001635,
"grad_norm": 3.205714702606201,
"learning_rate": 9.403156290457384e-07,
"loss": 0.7962,
"step": 6855
},
{
"epoch": 0.10679785470198572,
"grad_norm": 3.2358205318450928,
"learning_rate": 9.402336900411333e-07,
"loss": 0.8694,
"step": 6860
},
{
"epoch": 0.1068756957039551,
"grad_norm": 2.7498748302459717,
"learning_rate": 9.401517510365283e-07,
"loss": 0.7554,
"step": 6865
},
{
"epoch": 0.10695353670592447,
"grad_norm": 7.2536420822143555,
"learning_rate": 9.400698120319234e-07,
"loss": 0.9135,
"step": 6870
},
{
"epoch": 0.10703137770789385,
"grad_norm": 5.090606689453125,
"learning_rate": 9.399878730273185e-07,
"loss": 0.8358,
"step": 6875
},
{
"epoch": 0.10710921870986323,
"grad_norm": 3.6972696781158447,
"learning_rate": 9.399059340227134e-07,
"loss": 0.9106,
"step": 6880
},
{
"epoch": 0.10718705971183261,
"grad_norm": 3.833972692489624,
"learning_rate": 9.398239950181085e-07,
"loss": 0.9021,
"step": 6885
},
{
"epoch": 0.10726490071380199,
"grad_norm": 6.692166805267334,
"learning_rate": 9.397420560135036e-07,
"loss": 0.9014,
"step": 6890
},
{
"epoch": 0.10734274171577136,
"grad_norm": 3.5323872566223145,
"learning_rate": 9.396601170088985e-07,
"loss": 0.8122,
"step": 6895
},
{
"epoch": 0.10742058271774074,
"grad_norm": 5.148552894592285,
"learning_rate": 9.395781780042935e-07,
"loss": 0.7712,
"step": 6900
},
{
"epoch": 0.10749842371971012,
"grad_norm": 4.791245460510254,
"learning_rate": 9.394962389996886e-07,
"loss": 0.8229,
"step": 6905
},
{
"epoch": 0.1075762647216795,
"grad_norm": 7.922582149505615,
"learning_rate": 9.394142999950836e-07,
"loss": 0.8641,
"step": 6910
},
{
"epoch": 0.10765410572364888,
"grad_norm": 4.787046432495117,
"learning_rate": 9.393323609904786e-07,
"loss": 0.8567,
"step": 6915
},
{
"epoch": 0.10773194672561826,
"grad_norm": 7.581035137176514,
"learning_rate": 9.392504219858737e-07,
"loss": 0.9418,
"step": 6920
},
{
"epoch": 0.10780978772758763,
"grad_norm": 3.7408881187438965,
"learning_rate": 9.391684829812687e-07,
"loss": 0.7569,
"step": 6925
},
{
"epoch": 0.10788762872955701,
"grad_norm": 4.957324981689453,
"learning_rate": 9.390865439766637e-07,
"loss": 0.9865,
"step": 6930
},
{
"epoch": 0.10796546973152639,
"grad_norm": 4.249368190765381,
"learning_rate": 9.390046049720588e-07,
"loss": 0.7513,
"step": 6935
},
{
"epoch": 0.10804331073349577,
"grad_norm": 4.029480934143066,
"learning_rate": 9.389226659674537e-07,
"loss": 0.8097,
"step": 6940
},
{
"epoch": 0.10812115173546515,
"grad_norm": 3.9717726707458496,
"learning_rate": 9.388407269628488e-07,
"loss": 0.8199,
"step": 6945
},
{
"epoch": 0.10819899273743451,
"grad_norm": 4.825889587402344,
"learning_rate": 9.387587879582438e-07,
"loss": 0.8086,
"step": 6950
},
{
"epoch": 0.10827683373940389,
"grad_norm": 6.288622856140137,
"learning_rate": 9.386768489536388e-07,
"loss": 0.764,
"step": 6955
},
{
"epoch": 0.10835467474137327,
"grad_norm": 4.316305637359619,
"learning_rate": 9.385949099490339e-07,
"loss": 0.8855,
"step": 6960
},
{
"epoch": 0.10843251574334264,
"grad_norm": 2.9733645915985107,
"learning_rate": 9.38512970944429e-07,
"loss": 0.8997,
"step": 6965
},
{
"epoch": 0.10851035674531202,
"grad_norm": 7.586787700653076,
"learning_rate": 9.38431031939824e-07,
"loss": 0.7612,
"step": 6970
},
{
"epoch": 0.1085881977472814,
"grad_norm": 6.496944904327393,
"learning_rate": 9.38349092935219e-07,
"loss": 0.8064,
"step": 6975
},
{
"epoch": 0.10866603874925078,
"grad_norm": 10.352307319641113,
"learning_rate": 9.382671539306141e-07,
"loss": 0.8054,
"step": 6980
},
{
"epoch": 0.10874387975122016,
"grad_norm": 3.3039493560791016,
"learning_rate": 9.38185214926009e-07,
"loss": 0.7777,
"step": 6985
},
{
"epoch": 0.10882172075318954,
"grad_norm": 2.823133945465088,
"learning_rate": 9.38103275921404e-07,
"loss": 0.8755,
"step": 6990
},
{
"epoch": 0.10889956175515891,
"grad_norm": 5.029725074768066,
"learning_rate": 9.380213369167991e-07,
"loss": 0.7765,
"step": 6995
},
{
"epoch": 0.10897740275712829,
"grad_norm": 5.5392889976501465,
"learning_rate": 9.379393979121942e-07,
"loss": 0.8166,
"step": 7000
},
{
"epoch": 0.10905524375909767,
"grad_norm": 5.657524585723877,
"learning_rate": 9.378574589075891e-07,
"loss": 0.7781,
"step": 7005
},
{
"epoch": 0.10913308476106705,
"grad_norm": 5.030917167663574,
"learning_rate": 9.377755199029842e-07,
"loss": 0.8928,
"step": 7010
},
{
"epoch": 0.10921092576303643,
"grad_norm": 4.488454341888428,
"learning_rate": 9.376935808983793e-07,
"loss": 0.835,
"step": 7015
},
{
"epoch": 0.1092887667650058,
"grad_norm": 3.592827081680298,
"learning_rate": 9.376116418937743e-07,
"loss": 0.7534,
"step": 7020
},
{
"epoch": 0.10936660776697518,
"grad_norm": 6.4170331954956055,
"learning_rate": 9.375297028891692e-07,
"loss": 0.7978,
"step": 7025
},
{
"epoch": 0.10944444876894456,
"grad_norm": 3.166126251220703,
"learning_rate": 9.374477638845643e-07,
"loss": 0.8483,
"step": 7030
},
{
"epoch": 0.10952228977091392,
"grad_norm": 2.965501070022583,
"learning_rate": 9.373658248799593e-07,
"loss": 0.7824,
"step": 7035
},
{
"epoch": 0.1096001307728833,
"grad_norm": 4.2378058433532715,
"learning_rate": 9.372838858753543e-07,
"loss": 0.9217,
"step": 7040
},
{
"epoch": 0.10967797177485268,
"grad_norm": 5.209421634674072,
"learning_rate": 9.372019468707494e-07,
"loss": 0.835,
"step": 7045
},
{
"epoch": 0.10975581277682206,
"grad_norm": 4.27461576461792,
"learning_rate": 9.371200078661444e-07,
"loss": 0.9258,
"step": 7050
},
{
"epoch": 0.10983365377879144,
"grad_norm": 2.5676474571228027,
"learning_rate": 9.370380688615395e-07,
"loss": 0.8691,
"step": 7055
},
{
"epoch": 0.10991149478076082,
"grad_norm": 3.422879934310913,
"learning_rate": 9.369561298569345e-07,
"loss": 0.7527,
"step": 7060
},
{
"epoch": 0.1099893357827302,
"grad_norm": 4.083531379699707,
"learning_rate": 9.368741908523294e-07,
"loss": 0.7531,
"step": 7065
},
{
"epoch": 0.11006717678469957,
"grad_norm": 4.684252738952637,
"learning_rate": 9.367922518477245e-07,
"loss": 0.7772,
"step": 7070
},
{
"epoch": 0.11014501778666895,
"grad_norm": 3.0496606826782227,
"learning_rate": 9.367103128431195e-07,
"loss": 0.7993,
"step": 7075
},
{
"epoch": 0.11022285878863833,
"grad_norm": 3.641996145248413,
"learning_rate": 9.366283738385145e-07,
"loss": 0.7938,
"step": 7080
},
{
"epoch": 0.1103006997906077,
"grad_norm": 3.0637736320495605,
"learning_rate": 9.365464348339096e-07,
"loss": 0.8944,
"step": 7085
},
{
"epoch": 0.11037854079257708,
"grad_norm": 4.9412455558776855,
"learning_rate": 9.364644958293047e-07,
"loss": 0.7653,
"step": 7090
},
{
"epoch": 0.11045638179454646,
"grad_norm": 4.0071516036987305,
"learning_rate": 9.363825568246997e-07,
"loss": 0.8582,
"step": 7095
},
{
"epoch": 0.11053422279651584,
"grad_norm": 3.297551155090332,
"learning_rate": 9.363006178200947e-07,
"loss": 0.6726,
"step": 7100
},
{
"epoch": 0.11061206379848522,
"grad_norm": 6.013480186462402,
"learning_rate": 9.362186788154897e-07,
"loss": 0.777,
"step": 7105
},
{
"epoch": 0.1106899048004546,
"grad_norm": 4.557566165924072,
"learning_rate": 9.361367398108848e-07,
"loss": 0.9073,
"step": 7110
},
{
"epoch": 0.11076774580242398,
"grad_norm": 3.922395706176758,
"learning_rate": 9.360548008062797e-07,
"loss": 0.7608,
"step": 7115
},
{
"epoch": 0.11084558680439334,
"grad_norm": 4.4782867431640625,
"learning_rate": 9.359728618016748e-07,
"loss": 0.8138,
"step": 7120
},
{
"epoch": 0.11092342780636272,
"grad_norm": 3.320688486099243,
"learning_rate": 9.358909227970699e-07,
"loss": 0.7954,
"step": 7125
},
{
"epoch": 0.1110012688083321,
"grad_norm": 14.582179069519043,
"learning_rate": 9.358089837924648e-07,
"loss": 0.8295,
"step": 7130
},
{
"epoch": 0.11107910981030147,
"grad_norm": 5.56791877746582,
"learning_rate": 9.357270447878599e-07,
"loss": 0.7778,
"step": 7135
},
{
"epoch": 0.11115695081227085,
"grad_norm": 4.387538909912109,
"learning_rate": 9.35645105783255e-07,
"loss": 0.7354,
"step": 7140
},
{
"epoch": 0.11123479181424023,
"grad_norm": 3.590179443359375,
"learning_rate": 9.355631667786499e-07,
"loss": 0.7492,
"step": 7145
},
{
"epoch": 0.11131263281620961,
"grad_norm": 2.9361941814422607,
"learning_rate": 9.354812277740449e-07,
"loss": 0.9192,
"step": 7150
},
{
"epoch": 0.11139047381817899,
"grad_norm": 4.104539394378662,
"learning_rate": 9.3539928876944e-07,
"loss": 0.7818,
"step": 7155
},
{
"epoch": 0.11146831482014836,
"grad_norm": 3.3516862392425537,
"learning_rate": 9.35317349764835e-07,
"loss": 0.7937,
"step": 7160
},
{
"epoch": 0.11154615582211774,
"grad_norm": 3.5534565448760986,
"learning_rate": 9.3523541076023e-07,
"loss": 0.7168,
"step": 7165
},
{
"epoch": 0.11162399682408712,
"grad_norm": 3.8620402812957764,
"learning_rate": 9.351534717556251e-07,
"loss": 0.7666,
"step": 7170
},
{
"epoch": 0.1117018378260565,
"grad_norm": 5.330255031585693,
"learning_rate": 9.350715327510201e-07,
"loss": 0.7302,
"step": 7175
},
{
"epoch": 0.11177967882802588,
"grad_norm": 3.8225488662719727,
"learning_rate": 9.349895937464152e-07,
"loss": 0.7087,
"step": 7180
},
{
"epoch": 0.11185751982999526,
"grad_norm": 4.536187648773193,
"learning_rate": 9.349076547418101e-07,
"loss": 0.6757,
"step": 7185
},
{
"epoch": 0.11193536083196463,
"grad_norm": 3.3316333293914795,
"learning_rate": 9.348257157372051e-07,
"loss": 0.8506,
"step": 7190
},
{
"epoch": 0.11201320183393401,
"grad_norm": 4.451030731201172,
"learning_rate": 9.347437767326002e-07,
"loss": 0.8501,
"step": 7195
},
{
"epoch": 0.11209104283590339,
"grad_norm": 6.453036308288574,
"learning_rate": 9.346618377279953e-07,
"loss": 0.7665,
"step": 7200
},
{
"epoch": 0.11216888383787275,
"grad_norm": 3.7804341316223145,
"learning_rate": 9.345798987233902e-07,
"loss": 0.8482,
"step": 7205
},
{
"epoch": 0.11224672483984213,
"grad_norm": 4.703028678894043,
"learning_rate": 9.344979597187853e-07,
"loss": 0.8001,
"step": 7210
},
{
"epoch": 0.11232456584181151,
"grad_norm": 3.5996668338775635,
"learning_rate": 9.344160207141804e-07,
"loss": 0.8508,
"step": 7215
},
{
"epoch": 0.11240240684378089,
"grad_norm": 3.470485210418701,
"learning_rate": 9.343340817095754e-07,
"loss": 0.8929,
"step": 7220
},
{
"epoch": 0.11248024784575027,
"grad_norm": 9.36178970336914,
"learning_rate": 9.342521427049703e-07,
"loss": 0.7707,
"step": 7225
},
{
"epoch": 0.11255808884771965,
"grad_norm": 7.091135025024414,
"learning_rate": 9.341702037003654e-07,
"loss": 0.9335,
"step": 7230
},
{
"epoch": 0.11263592984968902,
"grad_norm": 4.552675247192383,
"learning_rate": 9.340882646957605e-07,
"loss": 0.9234,
"step": 7235
},
{
"epoch": 0.1127137708516584,
"grad_norm": 2.9877877235412598,
"learning_rate": 9.340063256911554e-07,
"loss": 0.8213,
"step": 7240
},
{
"epoch": 0.11279161185362778,
"grad_norm": 3.49109148979187,
"learning_rate": 9.339243866865505e-07,
"loss": 0.7713,
"step": 7245
},
{
"epoch": 0.11286945285559716,
"grad_norm": 3.662997245788574,
"learning_rate": 9.338424476819456e-07,
"loss": 0.8424,
"step": 7250
},
{
"epoch": 0.11294729385756654,
"grad_norm": 3.4681499004364014,
"learning_rate": 9.337605086773405e-07,
"loss": 0.9026,
"step": 7255
},
{
"epoch": 0.11302513485953591,
"grad_norm": 3.360700845718384,
"learning_rate": 9.336785696727356e-07,
"loss": 0.9131,
"step": 7260
},
{
"epoch": 0.11310297586150529,
"grad_norm": 3.470808982849121,
"learning_rate": 9.335966306681306e-07,
"loss": 0.8477,
"step": 7265
},
{
"epoch": 0.11318081686347467,
"grad_norm": 4.002136707305908,
"learning_rate": 9.335146916635256e-07,
"loss": 0.8753,
"step": 7270
},
{
"epoch": 0.11325865786544405,
"grad_norm": 3.301177978515625,
"learning_rate": 9.334327526589206e-07,
"loss": 0.8266,
"step": 7275
},
{
"epoch": 0.11333649886741343,
"grad_norm": 3.6634960174560547,
"learning_rate": 9.333508136543157e-07,
"loss": 0.6648,
"step": 7280
},
{
"epoch": 0.1134143398693828,
"grad_norm": 4.124035835266113,
"learning_rate": 9.332688746497107e-07,
"loss": 0.8151,
"step": 7285
},
{
"epoch": 0.11349218087135218,
"grad_norm": 5.273459434509277,
"learning_rate": 9.331869356451058e-07,
"loss": 0.7747,
"step": 7290
},
{
"epoch": 0.11357002187332155,
"grad_norm": 3.6614978313446045,
"learning_rate": 9.331049966405008e-07,
"loss": 0.8157,
"step": 7295
},
{
"epoch": 0.11364786287529093,
"grad_norm": 5.9759368896484375,
"learning_rate": 9.330230576358958e-07,
"loss": 0.7475,
"step": 7300
},
{
"epoch": 0.1137257038772603,
"grad_norm": 3.271934747695923,
"learning_rate": 9.329411186312909e-07,
"loss": 0.8176,
"step": 7305
},
{
"epoch": 0.11380354487922968,
"grad_norm": 6.224942207336426,
"learning_rate": 9.328591796266858e-07,
"loss": 0.7874,
"step": 7310
},
{
"epoch": 0.11388138588119906,
"grad_norm": 4.060842990875244,
"learning_rate": 9.327772406220808e-07,
"loss": 0.7104,
"step": 7315
},
{
"epoch": 0.11395922688316844,
"grad_norm": 3.273303985595703,
"learning_rate": 9.326953016174759e-07,
"loss": 0.9288,
"step": 7320
},
{
"epoch": 0.11403706788513782,
"grad_norm": 5.721134662628174,
"learning_rate": 9.32613362612871e-07,
"loss": 0.9447,
"step": 7325
},
{
"epoch": 0.1141149088871072,
"grad_norm": 3.942401885986328,
"learning_rate": 9.325314236082659e-07,
"loss": 0.6934,
"step": 7330
},
{
"epoch": 0.11419274988907657,
"grad_norm": 8.272555351257324,
"learning_rate": 9.32449484603661e-07,
"loss": 0.7917,
"step": 7335
},
{
"epoch": 0.11427059089104595,
"grad_norm": 9.704336166381836,
"learning_rate": 9.323675455990561e-07,
"loss": 0.816,
"step": 7340
},
{
"epoch": 0.11434843189301533,
"grad_norm": 3.2745420932769775,
"learning_rate": 9.322856065944512e-07,
"loss": 0.8432,
"step": 7345
},
{
"epoch": 0.1144262728949847,
"grad_norm": 4.300100803375244,
"learning_rate": 9.32203667589846e-07,
"loss": 0.8954,
"step": 7350
},
{
"epoch": 0.11450411389695409,
"grad_norm": 3.520085334777832,
"learning_rate": 9.321217285852411e-07,
"loss": 0.6849,
"step": 7355
},
{
"epoch": 0.11458195489892346,
"grad_norm": 3.1472392082214355,
"learning_rate": 9.320397895806362e-07,
"loss": 0.9081,
"step": 7360
},
{
"epoch": 0.11465979590089284,
"grad_norm": 5.217727184295654,
"learning_rate": 9.319578505760311e-07,
"loss": 0.9009,
"step": 7365
},
{
"epoch": 0.11473763690286222,
"grad_norm": 2.461811065673828,
"learning_rate": 9.318759115714262e-07,
"loss": 0.8018,
"step": 7370
},
{
"epoch": 0.1148154779048316,
"grad_norm": 4.464178562164307,
"learning_rate": 9.317939725668213e-07,
"loss": 0.7952,
"step": 7375
},
{
"epoch": 0.11489331890680096,
"grad_norm": 8.113191604614258,
"learning_rate": 9.317120335622163e-07,
"loss": 0.8338,
"step": 7380
},
{
"epoch": 0.11497115990877034,
"grad_norm": 3.7389204502105713,
"learning_rate": 9.316300945576113e-07,
"loss": 0.7343,
"step": 7385
},
{
"epoch": 0.11504900091073972,
"grad_norm": 3.955479145050049,
"learning_rate": 9.315481555530063e-07,
"loss": 0.8868,
"step": 7390
},
{
"epoch": 0.1151268419127091,
"grad_norm": 3.5472121238708496,
"learning_rate": 9.314662165484013e-07,
"loss": 0.7299,
"step": 7395
},
{
"epoch": 0.11520468291467847,
"grad_norm": 3.3682563304901123,
"learning_rate": 9.313842775437963e-07,
"loss": 0.8079,
"step": 7400
},
{
"epoch": 0.11528252391664785,
"grad_norm": 4.840713977813721,
"learning_rate": 9.313023385391914e-07,
"loss": 0.8041,
"step": 7405
},
{
"epoch": 0.11536036491861723,
"grad_norm": 6.368528366088867,
"learning_rate": 9.312203995345864e-07,
"loss": 0.8727,
"step": 7410
},
{
"epoch": 0.11543820592058661,
"grad_norm": 8.412065505981445,
"learning_rate": 9.311384605299815e-07,
"loss": 0.7993,
"step": 7415
},
{
"epoch": 0.11551604692255599,
"grad_norm": 4.714696884155273,
"learning_rate": 9.310565215253765e-07,
"loss": 0.7354,
"step": 7420
},
{
"epoch": 0.11559388792452537,
"grad_norm": 5.104187488555908,
"learning_rate": 9.309745825207715e-07,
"loss": 0.8301,
"step": 7425
},
{
"epoch": 0.11567172892649474,
"grad_norm": 2.960247039794922,
"learning_rate": 9.308926435161665e-07,
"loss": 0.7949,
"step": 7430
},
{
"epoch": 0.11574956992846412,
"grad_norm": 5.484702110290527,
"learning_rate": 9.308107045115615e-07,
"loss": 0.7747,
"step": 7435
},
{
"epoch": 0.1158274109304335,
"grad_norm": 3.192422866821289,
"learning_rate": 9.307287655069565e-07,
"loss": 0.8138,
"step": 7440
},
{
"epoch": 0.11590525193240288,
"grad_norm": 4.892508029937744,
"learning_rate": 9.306468265023516e-07,
"loss": 0.8059,
"step": 7445
},
{
"epoch": 0.11598309293437226,
"grad_norm": 3.488111972808838,
"learning_rate": 9.305648874977467e-07,
"loss": 0.7975,
"step": 7450
},
{
"epoch": 0.11606093393634163,
"grad_norm": 4.7030029296875,
"learning_rate": 9.304829484931416e-07,
"loss": 0.752,
"step": 7455
},
{
"epoch": 0.11613877493831101,
"grad_norm": 5.344095706939697,
"learning_rate": 9.304010094885367e-07,
"loss": 0.774,
"step": 7460
},
{
"epoch": 0.11621661594028038,
"grad_norm": 2.944584846496582,
"learning_rate": 9.303190704839318e-07,
"loss": 0.7948,
"step": 7465
},
{
"epoch": 0.11629445694224975,
"grad_norm": 2.8411998748779297,
"learning_rate": 9.302371314793266e-07,
"loss": 0.7698,
"step": 7470
},
{
"epoch": 0.11637229794421913,
"grad_norm": 6.780023097991943,
"learning_rate": 9.301551924747217e-07,
"loss": 0.8843,
"step": 7475
},
{
"epoch": 0.11645013894618851,
"grad_norm": 4.495726108551025,
"learning_rate": 9.300732534701168e-07,
"loss": 0.9019,
"step": 7480
},
{
"epoch": 0.11652797994815789,
"grad_norm": 4.61745023727417,
"learning_rate": 9.299913144655119e-07,
"loss": 0.8717,
"step": 7485
},
{
"epoch": 0.11660582095012727,
"grad_norm": 3.0337278842926025,
"learning_rate": 9.299093754609068e-07,
"loss": 0.791,
"step": 7490
},
{
"epoch": 0.11668366195209665,
"grad_norm": 6.76452112197876,
"learning_rate": 9.298274364563019e-07,
"loss": 0.7373,
"step": 7495
},
{
"epoch": 0.11676150295406602,
"grad_norm": 2.7412028312683105,
"learning_rate": 9.29745497451697e-07,
"loss": 0.7638,
"step": 7500
},
{
"epoch": 0.1168393439560354,
"grad_norm": 4.742910861968994,
"learning_rate": 9.29663558447092e-07,
"loss": 0.8951,
"step": 7505
},
{
"epoch": 0.11691718495800478,
"grad_norm": 2.368957042694092,
"learning_rate": 9.295816194424869e-07,
"loss": 0.7237,
"step": 7510
},
{
"epoch": 0.11699502595997416,
"grad_norm": 4.270255088806152,
"learning_rate": 9.29499680437882e-07,
"loss": 0.789,
"step": 7515
},
{
"epoch": 0.11707286696194354,
"grad_norm": 4.186617851257324,
"learning_rate": 9.29417741433277e-07,
"loss": 0.8655,
"step": 7520
},
{
"epoch": 0.11715070796391291,
"grad_norm": 2.4891741275787354,
"learning_rate": 9.29335802428672e-07,
"loss": 0.8008,
"step": 7525
},
{
"epoch": 0.11722854896588229,
"grad_norm": 3.5113582611083984,
"learning_rate": 9.292538634240671e-07,
"loss": 0.8465,
"step": 7530
},
{
"epoch": 0.11730638996785167,
"grad_norm": 4.365012168884277,
"learning_rate": 9.291719244194621e-07,
"loss": 0.8378,
"step": 7535
},
{
"epoch": 0.11738423096982105,
"grad_norm": 4.17802619934082,
"learning_rate": 9.290899854148572e-07,
"loss": 0.7605,
"step": 7540
},
{
"epoch": 0.11746207197179043,
"grad_norm": 3.046833038330078,
"learning_rate": 9.290080464102522e-07,
"loss": 0.6796,
"step": 7545
},
{
"epoch": 0.11753991297375979,
"grad_norm": 2.9827167987823486,
"learning_rate": 9.289261074056471e-07,
"loss": 0.7666,
"step": 7550
},
{
"epoch": 0.11761775397572917,
"grad_norm": 2.8555498123168945,
"learning_rate": 9.288441684010422e-07,
"loss": 0.8337,
"step": 7555
},
{
"epoch": 0.11769559497769855,
"grad_norm": 4.013402462005615,
"learning_rate": 9.287622293964373e-07,
"loss": 0.9353,
"step": 7560
},
{
"epoch": 0.11777343597966793,
"grad_norm": 6.579110622406006,
"learning_rate": 9.286802903918322e-07,
"loss": 0.7464,
"step": 7565
},
{
"epoch": 0.1178512769816373,
"grad_norm": 6.643786907196045,
"learning_rate": 9.285983513872273e-07,
"loss": 0.8852,
"step": 7570
},
{
"epoch": 0.11792911798360668,
"grad_norm": 3.837496042251587,
"learning_rate": 9.285164123826224e-07,
"loss": 0.7909,
"step": 7575
},
{
"epoch": 0.11800695898557606,
"grad_norm": 3.4954562187194824,
"learning_rate": 9.284344733780173e-07,
"loss": 0.8559,
"step": 7580
},
{
"epoch": 0.11808479998754544,
"grad_norm": 3.962890148162842,
"learning_rate": 9.283525343734124e-07,
"loss": 0.833,
"step": 7585
},
{
"epoch": 0.11816264098951482,
"grad_norm": 7.724937438964844,
"learning_rate": 9.282705953688074e-07,
"loss": 0.9339,
"step": 7590
},
{
"epoch": 0.1182404819914842,
"grad_norm": 5.779001235961914,
"learning_rate": 9.281886563642024e-07,
"loss": 0.6968,
"step": 7595
},
{
"epoch": 0.11831832299345357,
"grad_norm": 6.85791540145874,
"learning_rate": 9.281067173595974e-07,
"loss": 0.8279,
"step": 7600
},
{
"epoch": 0.11839616399542295,
"grad_norm": 3.720306158065796,
"learning_rate": 9.280247783549925e-07,
"loss": 0.7751,
"step": 7605
},
{
"epoch": 0.11847400499739233,
"grad_norm": 4.524914741516113,
"learning_rate": 9.279428393503876e-07,
"loss": 0.7227,
"step": 7610
},
{
"epoch": 0.11855184599936171,
"grad_norm": 3.6757309436798096,
"learning_rate": 9.278609003457825e-07,
"loss": 0.8816,
"step": 7615
},
{
"epoch": 0.11862968700133109,
"grad_norm": 3.6419453620910645,
"learning_rate": 9.277789613411776e-07,
"loss": 0.7741,
"step": 7620
},
{
"epoch": 0.11870752800330046,
"grad_norm": 3.0829155445098877,
"learning_rate": 9.276970223365727e-07,
"loss": 0.7712,
"step": 7625
},
{
"epoch": 0.11878536900526984,
"grad_norm": 6.524753570556641,
"learning_rate": 9.276150833319677e-07,
"loss": 0.8025,
"step": 7630
},
{
"epoch": 0.11886321000723922,
"grad_norm": 2.959907054901123,
"learning_rate": 9.275331443273626e-07,
"loss": 0.932,
"step": 7635
},
{
"epoch": 0.11894105100920858,
"grad_norm": 3.6245338916778564,
"learning_rate": 9.274512053227577e-07,
"loss": 0.6997,
"step": 7640
},
{
"epoch": 0.11901889201117796,
"grad_norm": 4.328200340270996,
"learning_rate": 9.273692663181527e-07,
"loss": 0.8786,
"step": 7645
},
{
"epoch": 0.11909673301314734,
"grad_norm": 4.048555374145508,
"learning_rate": 9.272873273135478e-07,
"loss": 0.6936,
"step": 7650
},
{
"epoch": 0.11917457401511672,
"grad_norm": 3.2797539234161377,
"learning_rate": 9.272053883089428e-07,
"loss": 0.7915,
"step": 7655
},
{
"epoch": 0.1192524150170861,
"grad_norm": 9.63089656829834,
"learning_rate": 9.271234493043378e-07,
"loss": 0.8417,
"step": 7660
},
{
"epoch": 0.11933025601905548,
"grad_norm": 4.521198272705078,
"learning_rate": 9.270415102997329e-07,
"loss": 0.8831,
"step": 7665
},
{
"epoch": 0.11940809702102485,
"grad_norm": 16.66404151916504,
"learning_rate": 9.269595712951279e-07,
"loss": 0.8582,
"step": 7670
},
{
"epoch": 0.11948593802299423,
"grad_norm": 8.25177001953125,
"learning_rate": 9.268776322905228e-07,
"loss": 0.818,
"step": 7675
},
{
"epoch": 0.11956377902496361,
"grad_norm": 3.3690059185028076,
"learning_rate": 9.267956932859179e-07,
"loss": 0.689,
"step": 7680
},
{
"epoch": 0.11964162002693299,
"grad_norm": 4.987194061279297,
"learning_rate": 9.26713754281313e-07,
"loss": 0.8771,
"step": 7685
},
{
"epoch": 0.11971946102890237,
"grad_norm": 5.207099914550781,
"learning_rate": 9.266318152767079e-07,
"loss": 0.898,
"step": 7690
},
{
"epoch": 0.11979730203087174,
"grad_norm": 4.396019458770752,
"learning_rate": 9.26549876272103e-07,
"loss": 0.7181,
"step": 7695
},
{
"epoch": 0.11987514303284112,
"grad_norm": 3.1832330226898193,
"learning_rate": 9.264679372674981e-07,
"loss": 0.8264,
"step": 7700
},
{
"epoch": 0.1199529840348105,
"grad_norm": 3.0821919441223145,
"learning_rate": 9.26385998262893e-07,
"loss": 0.8412,
"step": 7705
},
{
"epoch": 0.12003082503677988,
"grad_norm": 4.4039626121521,
"learning_rate": 9.263040592582881e-07,
"loss": 0.7891,
"step": 7710
},
{
"epoch": 0.12010866603874926,
"grad_norm": 3.0394608974456787,
"learning_rate": 9.262221202536831e-07,
"loss": 0.7594,
"step": 7715
},
{
"epoch": 0.12018650704071863,
"grad_norm": 3.430525779724121,
"learning_rate": 9.261401812490781e-07,
"loss": 0.9487,
"step": 7720
},
{
"epoch": 0.120264348042688,
"grad_norm": 3.6181464195251465,
"learning_rate": 9.260582422444731e-07,
"loss": 0.881,
"step": 7725
},
{
"epoch": 0.12034218904465738,
"grad_norm": 9.917842864990234,
"learning_rate": 9.259763032398682e-07,
"loss": 0.7738,
"step": 7730
},
{
"epoch": 0.12042003004662676,
"grad_norm": 5.7242255210876465,
"learning_rate": 9.258943642352633e-07,
"loss": 0.9353,
"step": 7735
},
{
"epoch": 0.12049787104859613,
"grad_norm": 5.354982852935791,
"learning_rate": 9.258124252306583e-07,
"loss": 0.7719,
"step": 7740
},
{
"epoch": 0.12057571205056551,
"grad_norm": 5.967920780181885,
"learning_rate": 9.257304862260533e-07,
"loss": 0.7304,
"step": 7745
},
{
"epoch": 0.12065355305253489,
"grad_norm": 3.245041847229004,
"learning_rate": 9.256485472214484e-07,
"loss": 0.6752,
"step": 7750
},
{
"epoch": 0.12073139405450427,
"grad_norm": 6.362682342529297,
"learning_rate": 9.255666082168433e-07,
"loss": 0.8159,
"step": 7755
},
{
"epoch": 0.12080923505647365,
"grad_norm": 5.759777069091797,
"learning_rate": 9.254846692122383e-07,
"loss": 0.6947,
"step": 7760
},
{
"epoch": 0.12088707605844302,
"grad_norm": 3.7885186672210693,
"learning_rate": 9.254027302076334e-07,
"loss": 0.8279,
"step": 7765
},
{
"epoch": 0.1209649170604124,
"grad_norm": 2.970975399017334,
"learning_rate": 9.253207912030284e-07,
"loss": 0.8366,
"step": 7770
},
{
"epoch": 0.12104275806238178,
"grad_norm": 3.9208946228027344,
"learning_rate": 9.252388521984235e-07,
"loss": 0.8122,
"step": 7775
},
{
"epoch": 0.12112059906435116,
"grad_norm": 3.411137580871582,
"learning_rate": 9.251569131938185e-07,
"loss": 0.736,
"step": 7780
},
{
"epoch": 0.12119844006632054,
"grad_norm": 10.479853630065918,
"learning_rate": 9.250749741892135e-07,
"loss": 0.9075,
"step": 7785
},
{
"epoch": 0.12127628106828992,
"grad_norm": 4.920512676239014,
"learning_rate": 9.249930351846086e-07,
"loss": 0.8346,
"step": 7790
},
{
"epoch": 0.1213541220702593,
"grad_norm": 5.350609302520752,
"learning_rate": 9.249110961800035e-07,
"loss": 0.8034,
"step": 7795
},
{
"epoch": 0.12143196307222867,
"grad_norm": 6.461511611938477,
"learning_rate": 9.248291571753985e-07,
"loss": 0.6603,
"step": 7800
},
{
"epoch": 0.12150980407419805,
"grad_norm": 4.6280837059021,
"learning_rate": 9.247472181707936e-07,
"loss": 0.801,
"step": 7805
},
{
"epoch": 0.12158764507616741,
"grad_norm": 8.22148323059082,
"learning_rate": 9.246652791661887e-07,
"loss": 0.8335,
"step": 7810
},
{
"epoch": 0.12166548607813679,
"grad_norm": 7.433074951171875,
"learning_rate": 9.245833401615836e-07,
"loss": 0.7171,
"step": 7815
},
{
"epoch": 0.12174332708010617,
"grad_norm": 5.692368507385254,
"learning_rate": 9.245014011569787e-07,
"loss": 0.8649,
"step": 7820
},
{
"epoch": 0.12182116808207555,
"grad_norm": 3.0104875564575195,
"learning_rate": 9.244194621523738e-07,
"loss": 0.6021,
"step": 7825
},
{
"epoch": 0.12189900908404493,
"grad_norm": 6.160313606262207,
"learning_rate": 9.243375231477688e-07,
"loss": 0.8652,
"step": 7830
},
{
"epoch": 0.1219768500860143,
"grad_norm": 4.373056411743164,
"learning_rate": 9.242555841431637e-07,
"loss": 0.7689,
"step": 7835
},
{
"epoch": 0.12205469108798368,
"grad_norm": 5.760363578796387,
"learning_rate": 9.241736451385588e-07,
"loss": 0.8888,
"step": 7840
},
{
"epoch": 0.12213253208995306,
"grad_norm": 4.07029914855957,
"learning_rate": 9.240917061339538e-07,
"loss": 0.7693,
"step": 7845
},
{
"epoch": 0.12221037309192244,
"grad_norm": 3.7920992374420166,
"learning_rate": 9.240097671293488e-07,
"loss": 0.8221,
"step": 7850
},
{
"epoch": 0.12228821409389182,
"grad_norm": 7.572727203369141,
"learning_rate": 9.239278281247439e-07,
"loss": 0.8255,
"step": 7855
},
{
"epoch": 0.1223660550958612,
"grad_norm": 5.047962188720703,
"learning_rate": 9.23845889120139e-07,
"loss": 0.8189,
"step": 7860
},
{
"epoch": 0.12244389609783057,
"grad_norm": 6.445250988006592,
"learning_rate": 9.23763950115534e-07,
"loss": 0.9137,
"step": 7865
},
{
"epoch": 0.12252173709979995,
"grad_norm": 2.9722390174865723,
"learning_rate": 9.23682011110929e-07,
"loss": 0.895,
"step": 7870
},
{
"epoch": 0.12259957810176933,
"grad_norm": 3.0519466400146484,
"learning_rate": 9.23600072106324e-07,
"loss": 0.8048,
"step": 7875
},
{
"epoch": 0.12267741910373871,
"grad_norm": 5.44154167175293,
"learning_rate": 9.23518133101719e-07,
"loss": 0.6992,
"step": 7880
},
{
"epoch": 0.12275526010570809,
"grad_norm": 11.307611465454102,
"learning_rate": 9.23436194097114e-07,
"loss": 0.8595,
"step": 7885
},
{
"epoch": 0.12283310110767746,
"grad_norm": 6.504110336303711,
"learning_rate": 9.233542550925091e-07,
"loss": 0.7882,
"step": 7890
},
{
"epoch": 0.12291094210964683,
"grad_norm": 2.5726773738861084,
"learning_rate": 9.232723160879041e-07,
"loss": 0.8535,
"step": 7895
},
{
"epoch": 0.1229887831116162,
"grad_norm": 3.1990294456481934,
"learning_rate": 9.231903770832992e-07,
"loss": 0.7941,
"step": 7900
},
{
"epoch": 0.12306662411358558,
"grad_norm": 3.614875555038452,
"learning_rate": 9.231084380786942e-07,
"loss": 0.6803,
"step": 7905
},
{
"epoch": 0.12314446511555496,
"grad_norm": 5.973575592041016,
"learning_rate": 9.230264990740892e-07,
"loss": 0.7994,
"step": 7910
},
{
"epoch": 0.12322230611752434,
"grad_norm": 3.558375358581543,
"learning_rate": 9.229445600694842e-07,
"loss": 0.8292,
"step": 7915
},
{
"epoch": 0.12330014711949372,
"grad_norm": 4.37818717956543,
"learning_rate": 9.228626210648793e-07,
"loss": 0.8512,
"step": 7920
},
{
"epoch": 0.1233779881214631,
"grad_norm": 7.052612781524658,
"learning_rate": 9.227806820602742e-07,
"loss": 0.7453,
"step": 7925
},
{
"epoch": 0.12345582912343248,
"grad_norm": 9.181270599365234,
"learning_rate": 9.226987430556693e-07,
"loss": 0.6847,
"step": 7930
},
{
"epoch": 0.12353367012540185,
"grad_norm": 4.499727249145508,
"learning_rate": 9.226168040510644e-07,
"loss": 0.7718,
"step": 7935
},
{
"epoch": 0.12361151112737123,
"grad_norm": 3.2490360736846924,
"learning_rate": 9.225348650464593e-07,
"loss": 0.7631,
"step": 7940
},
{
"epoch": 0.12368935212934061,
"grad_norm": 3.26198410987854,
"learning_rate": 9.224529260418544e-07,
"loss": 0.8257,
"step": 7945
},
{
"epoch": 0.12376719313130999,
"grad_norm": 7.192509174346924,
"learning_rate": 9.223709870372495e-07,
"loss": 0.809,
"step": 7950
},
{
"epoch": 0.12384503413327937,
"grad_norm": 3.1407008171081543,
"learning_rate": 9.222890480326445e-07,
"loss": 0.8444,
"step": 7955
},
{
"epoch": 0.12392287513524874,
"grad_norm": 4.12625789642334,
"learning_rate": 9.222071090280394e-07,
"loss": 0.8275,
"step": 7960
},
{
"epoch": 0.12400071613721812,
"grad_norm": 5.525479316711426,
"learning_rate": 9.221251700234345e-07,
"loss": 0.8999,
"step": 7965
},
{
"epoch": 0.1240785571391875,
"grad_norm": 6.302455902099609,
"learning_rate": 9.220432310188295e-07,
"loss": 0.7642,
"step": 7970
},
{
"epoch": 0.12415639814115688,
"grad_norm": 4.649979114532471,
"learning_rate": 9.219612920142245e-07,
"loss": 0.7002,
"step": 7975
},
{
"epoch": 0.12423423914312626,
"grad_norm": 5.463395595550537,
"learning_rate": 9.218793530096196e-07,
"loss": 0.8473,
"step": 7980
},
{
"epoch": 0.12431208014509562,
"grad_norm": 6.704756736755371,
"learning_rate": 9.217974140050147e-07,
"loss": 0.6967,
"step": 7985
},
{
"epoch": 0.124389921147065,
"grad_norm": 4.5808539390563965,
"learning_rate": 9.217154750004097e-07,
"loss": 0.7898,
"step": 7990
},
{
"epoch": 0.12446776214903438,
"grad_norm": 2.9757680892944336,
"learning_rate": 9.216335359958047e-07,
"loss": 0.698,
"step": 7995
},
{
"epoch": 0.12454560315100376,
"grad_norm": 5.5388617515563965,
"learning_rate": 9.215515969911997e-07,
"loss": 0.8798,
"step": 8000
},
{
"epoch": 0.12462344415297313,
"grad_norm": 5.424650192260742,
"learning_rate": 9.214696579865947e-07,
"loss": 0.814,
"step": 8005
},
{
"epoch": 0.12470128515494251,
"grad_norm": 4.0730109214782715,
"learning_rate": 9.213877189819898e-07,
"loss": 0.6547,
"step": 8010
},
{
"epoch": 0.12477912615691189,
"grad_norm": 5.436715126037598,
"learning_rate": 9.213057799773848e-07,
"loss": 0.776,
"step": 8015
},
{
"epoch": 0.12485696715888127,
"grad_norm": 3.6354191303253174,
"learning_rate": 9.212238409727798e-07,
"loss": 0.7196,
"step": 8020
},
{
"epoch": 0.12493480816085065,
"grad_norm": 3.5990068912506104,
"learning_rate": 9.211419019681749e-07,
"loss": 0.8079,
"step": 8025
},
{
"epoch": 0.12501264916282,
"grad_norm": 4.007763385772705,
"learning_rate": 9.2105996296357e-07,
"loss": 0.7711,
"step": 8030
},
{
"epoch": 0.1250904901647894,
"grad_norm": 4.223349571228027,
"learning_rate": 9.209780239589649e-07,
"loss": 0.8481,
"step": 8035
},
{
"epoch": 0.12516833116675877,
"grad_norm": 4.40108060836792,
"learning_rate": 9.208960849543599e-07,
"loss": 0.7981,
"step": 8040
},
{
"epoch": 0.12524617216872816,
"grad_norm": 2.7487142086029053,
"learning_rate": 9.20814145949755e-07,
"loss": 0.6693,
"step": 8045
},
{
"epoch": 0.12532401317069752,
"grad_norm": 3.568763017654419,
"learning_rate": 9.207322069451499e-07,
"loss": 0.7716,
"step": 8050
},
{
"epoch": 0.12540185417266692,
"grad_norm": 3.7271010875701904,
"learning_rate": 9.20650267940545e-07,
"loss": 0.7969,
"step": 8055
},
{
"epoch": 0.12547969517463628,
"grad_norm": 4.352176189422607,
"learning_rate": 9.205683289359401e-07,
"loss": 0.8513,
"step": 8060
},
{
"epoch": 0.12555753617660567,
"grad_norm": 3.4279236793518066,
"learning_rate": 9.20486389931335e-07,
"loss": 0.8315,
"step": 8065
},
{
"epoch": 0.12563537717857504,
"grad_norm": 5.192807197570801,
"learning_rate": 9.204044509267301e-07,
"loss": 0.6887,
"step": 8070
},
{
"epoch": 0.12571321818054443,
"grad_norm": 7.44572114944458,
"learning_rate": 9.203225119221252e-07,
"loss": 0.8179,
"step": 8075
},
{
"epoch": 0.1257910591825138,
"grad_norm": 2.792656183242798,
"learning_rate": 9.202405729175201e-07,
"loss": 0.8136,
"step": 8080
},
{
"epoch": 0.12586890018448318,
"grad_norm": 3.840090036392212,
"learning_rate": 9.201586339129151e-07,
"loss": 0.7216,
"step": 8085
},
{
"epoch": 0.12594674118645255,
"grad_norm": 6.609809398651123,
"learning_rate": 9.200766949083102e-07,
"loss": 0.8109,
"step": 8090
},
{
"epoch": 0.12602458218842194,
"grad_norm": 3.8064773082733154,
"learning_rate": 9.199947559037052e-07,
"loss": 0.7874,
"step": 8095
},
{
"epoch": 0.1261024231903913,
"grad_norm": 4.612837791442871,
"learning_rate": 9.199128168991003e-07,
"loss": 0.8989,
"step": 8100
},
{
"epoch": 0.1261802641923607,
"grad_norm": 3.837954044342041,
"learning_rate": 9.198308778944953e-07,
"loss": 0.7964,
"step": 8105
},
{
"epoch": 0.12625810519433006,
"grad_norm": 5.080657482147217,
"learning_rate": 9.197489388898904e-07,
"loss": 0.7835,
"step": 8110
},
{
"epoch": 0.12633594619629943,
"grad_norm": 5.589513301849365,
"learning_rate": 9.196669998852854e-07,
"loss": 0.8027,
"step": 8115
},
{
"epoch": 0.12641378719826882,
"grad_norm": 5.392527103424072,
"learning_rate": 9.195850608806803e-07,
"loss": 0.7721,
"step": 8120
},
{
"epoch": 0.12649162820023818,
"grad_norm": 8.648225784301758,
"learning_rate": 9.195031218760754e-07,
"loss": 0.8396,
"step": 8125
},
{
"epoch": 0.12656946920220757,
"grad_norm": 3.0926201343536377,
"learning_rate": 9.194211828714704e-07,
"loss": 0.8282,
"step": 8130
},
{
"epoch": 0.12664731020417694,
"grad_norm": 4.515932083129883,
"learning_rate": 9.193392438668655e-07,
"loss": 0.8238,
"step": 8135
},
{
"epoch": 0.12672515120614633,
"grad_norm": 6.131096839904785,
"learning_rate": 9.192573048622605e-07,
"loss": 0.8638,
"step": 8140
},
{
"epoch": 0.1268029922081157,
"grad_norm": 3.634267807006836,
"learning_rate": 9.191753658576555e-07,
"loss": 0.716,
"step": 8145
},
{
"epoch": 0.1268808332100851,
"grad_norm": 4.676586627960205,
"learning_rate": 9.190934268530506e-07,
"loss": 0.8273,
"step": 8150
},
{
"epoch": 0.12695867421205445,
"grad_norm": 3.584019422531128,
"learning_rate": 9.190114878484457e-07,
"loss": 0.805,
"step": 8155
},
{
"epoch": 0.12703651521402384,
"grad_norm": 3.7464358806610107,
"learning_rate": 9.189295488438405e-07,
"loss": 0.7724,
"step": 8160
},
{
"epoch": 0.1271143562159932,
"grad_norm": 3.974726438522339,
"learning_rate": 9.188476098392356e-07,
"loss": 0.807,
"step": 8165
},
{
"epoch": 0.1271921972179626,
"grad_norm": 5.144652843475342,
"learning_rate": 9.187656708346307e-07,
"loss": 0.8721,
"step": 8170
},
{
"epoch": 0.12727003821993196,
"grad_norm": 7.679945945739746,
"learning_rate": 9.186837318300256e-07,
"loss": 0.7945,
"step": 8175
},
{
"epoch": 0.12734787922190136,
"grad_norm": 5.530436992645264,
"learning_rate": 9.186017928254207e-07,
"loss": 0.9354,
"step": 8180
},
{
"epoch": 0.12742572022387072,
"grad_norm": 3.981515884399414,
"learning_rate": 9.185198538208158e-07,
"loss": 0.6024,
"step": 8185
},
{
"epoch": 0.1275035612258401,
"grad_norm": 3.5425384044647217,
"learning_rate": 9.184379148162108e-07,
"loss": 0.8061,
"step": 8190
},
{
"epoch": 0.12758140222780948,
"grad_norm": 3.2570059299468994,
"learning_rate": 9.183559758116058e-07,
"loss": 0.7803,
"step": 8195
},
{
"epoch": 0.12765924322977884,
"grad_norm": 3.267265558242798,
"learning_rate": 9.182740368070008e-07,
"loss": 0.7927,
"step": 8200
},
{
"epoch": 0.12773708423174823,
"grad_norm": 3.4302942752838135,
"learning_rate": 9.181920978023958e-07,
"loss": 0.8104,
"step": 8205
},
{
"epoch": 0.1278149252337176,
"grad_norm": 4.691220760345459,
"learning_rate": 9.181101587977908e-07,
"loss": 0.7826,
"step": 8210
},
{
"epoch": 0.127892766235687,
"grad_norm": 4.321291446685791,
"learning_rate": 9.180282197931859e-07,
"loss": 0.8503,
"step": 8215
},
{
"epoch": 0.12797060723765635,
"grad_norm": 6.356113433837891,
"learning_rate": 9.179462807885809e-07,
"loss": 0.8026,
"step": 8220
},
{
"epoch": 0.12804844823962575,
"grad_norm": 6.146854400634766,
"learning_rate": 9.17864341783976e-07,
"loss": 0.7779,
"step": 8225
},
{
"epoch": 0.1281262892415951,
"grad_norm": 4.845507621765137,
"learning_rate": 9.17782402779371e-07,
"loss": 0.855,
"step": 8230
},
{
"epoch": 0.1282041302435645,
"grad_norm": 4.020603179931641,
"learning_rate": 9.177004637747661e-07,
"loss": 0.8117,
"step": 8235
},
{
"epoch": 0.12828197124553387,
"grad_norm": 3.1681554317474365,
"learning_rate": 9.17618524770161e-07,
"loss": 1.0093,
"step": 8240
},
{
"epoch": 0.12835981224750326,
"grad_norm": 12.957433700561523,
"learning_rate": 9.17536585765556e-07,
"loss": 0.8047,
"step": 8245
},
{
"epoch": 0.12843765324947262,
"grad_norm": 3.9429116249084473,
"learning_rate": 9.174546467609511e-07,
"loss": 0.7415,
"step": 8250
},
{
"epoch": 0.12851549425144201,
"grad_norm": 5.232780933380127,
"learning_rate": 9.173727077563461e-07,
"loss": 0.9999,
"step": 8255
},
{
"epoch": 0.12859333525341138,
"grad_norm": 5.215351104736328,
"learning_rate": 9.172907687517412e-07,
"loss": 0.7994,
"step": 8260
},
{
"epoch": 0.12867117625538077,
"grad_norm": 3.917405366897583,
"learning_rate": 9.172088297471362e-07,
"loss": 0.7882,
"step": 8265
},
{
"epoch": 0.12874901725735013,
"grad_norm": 5.005404949188232,
"learning_rate": 9.171268907425312e-07,
"loss": 0.8112,
"step": 8270
},
{
"epoch": 0.12882685825931953,
"grad_norm": 3.008211851119995,
"learning_rate": 9.170449517379263e-07,
"loss": 0.8249,
"step": 8275
},
{
"epoch": 0.1289046992612889,
"grad_norm": 9.127318382263184,
"learning_rate": 9.169630127333214e-07,
"loss": 0.8224,
"step": 8280
},
{
"epoch": 0.12898254026325826,
"grad_norm": 4.283410549163818,
"learning_rate": 9.168810737287162e-07,
"loss": 0.8319,
"step": 8285
},
{
"epoch": 0.12906038126522765,
"grad_norm": 9.957722663879395,
"learning_rate": 9.167991347241113e-07,
"loss": 0.7951,
"step": 8290
},
{
"epoch": 0.129138222267197,
"grad_norm": 3.6089370250701904,
"learning_rate": 9.167171957195064e-07,
"loss": 0.7409,
"step": 8295
},
{
"epoch": 0.1292160632691664,
"grad_norm": 6.038301467895508,
"learning_rate": 9.166352567149013e-07,
"loss": 0.802,
"step": 8300
},
{
"epoch": 0.12929390427113577,
"grad_norm": 5.123950481414795,
"learning_rate": 9.165533177102964e-07,
"loss": 0.723,
"step": 8305
},
{
"epoch": 0.12937174527310516,
"grad_norm": 3.86824369430542,
"learning_rate": 9.164713787056915e-07,
"loss": 0.811,
"step": 8310
},
{
"epoch": 0.12944958627507452,
"grad_norm": 4.312297821044922,
"learning_rate": 9.163894397010865e-07,
"loss": 0.7,
"step": 8315
},
{
"epoch": 0.12952742727704392,
"grad_norm": 2.920485258102417,
"learning_rate": 9.163075006964815e-07,
"loss": 0.8012,
"step": 8320
},
{
"epoch": 0.12960526827901328,
"grad_norm": 3.8033828735351562,
"learning_rate": 9.162255616918765e-07,
"loss": 0.8236,
"step": 8325
},
{
"epoch": 0.12968310928098267,
"grad_norm": 7.963630199432373,
"learning_rate": 9.161436226872715e-07,
"loss": 0.8041,
"step": 8330
},
{
"epoch": 0.12976095028295204,
"grad_norm": 3.964761734008789,
"learning_rate": 9.160616836826666e-07,
"loss": 0.8897,
"step": 8335
},
{
"epoch": 0.12983879128492143,
"grad_norm": 3.360156297683716,
"learning_rate": 9.159797446780616e-07,
"loss": 0.7087,
"step": 8340
},
{
"epoch": 0.1299166322868908,
"grad_norm": 4.731776237487793,
"learning_rate": 9.158978056734567e-07,
"loss": 0.7926,
"step": 8345
},
{
"epoch": 0.12999447328886019,
"grad_norm": 3.076554775238037,
"learning_rate": 9.158158666688517e-07,
"loss": 0.7662,
"step": 8350
},
{
"epoch": 0.13007231429082955,
"grad_norm": 3.2537529468536377,
"learning_rate": 9.157339276642467e-07,
"loss": 0.8568,
"step": 8355
},
{
"epoch": 0.13015015529279894,
"grad_norm": 4.160289287567139,
"learning_rate": 9.156519886596418e-07,
"loss": 0.7748,
"step": 8360
},
{
"epoch": 0.1302279962947683,
"grad_norm": 3.388763666152954,
"learning_rate": 9.155700496550367e-07,
"loss": 0.7411,
"step": 8365
},
{
"epoch": 0.13030583729673767,
"grad_norm": 3.7193074226379395,
"learning_rate": 9.154881106504318e-07,
"loss": 0.7553,
"step": 8370
},
{
"epoch": 0.13038367829870706,
"grad_norm": 4.124868392944336,
"learning_rate": 9.154061716458268e-07,
"loss": 0.8061,
"step": 8375
},
{
"epoch": 0.13046151930067643,
"grad_norm": 3.1243176460266113,
"learning_rate": 9.153242326412218e-07,
"loss": 0.7673,
"step": 8380
},
{
"epoch": 0.13053936030264582,
"grad_norm": 3.175187587738037,
"learning_rate": 9.152422936366169e-07,
"loss": 0.636,
"step": 8385
},
{
"epoch": 0.13061720130461518,
"grad_norm": 3.486941337585449,
"learning_rate": 9.15160354632012e-07,
"loss": 0.768,
"step": 8390
},
{
"epoch": 0.13069504230658457,
"grad_norm": 3.408848762512207,
"learning_rate": 9.150784156274069e-07,
"loss": 0.8326,
"step": 8395
},
{
"epoch": 0.13077288330855394,
"grad_norm": 5.37129545211792,
"learning_rate": 9.14996476622802e-07,
"loss": 0.7712,
"step": 8400
},
{
"epoch": 0.13085072431052333,
"grad_norm": 2.642165422439575,
"learning_rate": 9.14914537618197e-07,
"loss": 0.8611,
"step": 8405
},
{
"epoch": 0.1309285653124927,
"grad_norm": 3.8213489055633545,
"learning_rate": 9.148325986135919e-07,
"loss": 0.7938,
"step": 8410
},
{
"epoch": 0.1310064063144621,
"grad_norm": 3.893542528152466,
"learning_rate": 9.14750659608987e-07,
"loss": 0.7761,
"step": 8415
},
{
"epoch": 0.13108424731643145,
"grad_norm": 3.6185567378997803,
"learning_rate": 9.146687206043821e-07,
"loss": 0.776,
"step": 8420
},
{
"epoch": 0.13116208831840084,
"grad_norm": 7.76255989074707,
"learning_rate": 9.14586781599777e-07,
"loss": 0.8723,
"step": 8425
},
{
"epoch": 0.1312399293203702,
"grad_norm": 3.2361936569213867,
"learning_rate": 9.145048425951721e-07,
"loss": 0.7151,
"step": 8430
},
{
"epoch": 0.1313177703223396,
"grad_norm": 7.259923458099365,
"learning_rate": 9.144229035905672e-07,
"loss": 0.7714,
"step": 8435
},
{
"epoch": 0.13139561132430896,
"grad_norm": 8.949355125427246,
"learning_rate": 9.143409645859622e-07,
"loss": 0.8829,
"step": 8440
},
{
"epoch": 0.13147345232627836,
"grad_norm": 5.755862712860107,
"learning_rate": 9.142590255813571e-07,
"loss": 0.8672,
"step": 8445
},
{
"epoch": 0.13155129332824772,
"grad_norm": 4.543202877044678,
"learning_rate": 9.141770865767522e-07,
"loss": 0.8165,
"step": 8450
},
{
"epoch": 0.1316291343302171,
"grad_norm": 3.6541123390197754,
"learning_rate": 9.140951475721472e-07,
"loss": 0.8017,
"step": 8455
},
{
"epoch": 0.13170697533218648,
"grad_norm": 3.5702321529388428,
"learning_rate": 9.140132085675423e-07,
"loss": 0.8063,
"step": 8460
},
{
"epoch": 0.13178481633415584,
"grad_norm": 3.831411361694336,
"learning_rate": 9.139312695629373e-07,
"loss": 0.832,
"step": 8465
},
{
"epoch": 0.13186265733612523,
"grad_norm": 3.0359880924224854,
"learning_rate": 9.138493305583324e-07,
"loss": 0.6776,
"step": 8470
},
{
"epoch": 0.1319404983380946,
"grad_norm": 4.264082908630371,
"learning_rate": 9.137673915537274e-07,
"loss": 0.6582,
"step": 8475
},
{
"epoch": 0.132018339340064,
"grad_norm": 4.2727508544921875,
"learning_rate": 9.136854525491225e-07,
"loss": 0.7432,
"step": 8480
},
{
"epoch": 0.13209618034203335,
"grad_norm": 7.4130539894104,
"learning_rate": 9.136035135445174e-07,
"loss": 0.7692,
"step": 8485
},
{
"epoch": 0.13217402134400275,
"grad_norm": 5.631756782531738,
"learning_rate": 9.135215745399124e-07,
"loss": 0.7297,
"step": 8490
},
{
"epoch": 0.1322518623459721,
"grad_norm": 4.975935459136963,
"learning_rate": 9.134396355353075e-07,
"loss": 0.8036,
"step": 8495
},
{
"epoch": 0.1323297033479415,
"grad_norm": 3.5903608798980713,
"learning_rate": 9.133576965307025e-07,
"loss": 0.8833,
"step": 8500
},
{
"epoch": 0.13240754434991087,
"grad_norm": 4.563084602355957,
"learning_rate": 9.132757575260975e-07,
"loss": 0.7386,
"step": 8505
},
{
"epoch": 0.13248538535188026,
"grad_norm": 5.170467853546143,
"learning_rate": 9.131938185214926e-07,
"loss": 0.8154,
"step": 8510
},
{
"epoch": 0.13256322635384962,
"grad_norm": 6.0783233642578125,
"learning_rate": 9.131118795168877e-07,
"loss": 0.8525,
"step": 8515
},
{
"epoch": 0.13264106735581901,
"grad_norm": 3.2167932987213135,
"learning_rate": 9.130299405122826e-07,
"loss": 0.7605,
"step": 8520
},
{
"epoch": 0.13271890835778838,
"grad_norm": 4.35626745223999,
"learning_rate": 9.129480015076776e-07,
"loss": 0.7577,
"step": 8525
},
{
"epoch": 0.13279674935975777,
"grad_norm": 6.491021633148193,
"learning_rate": 9.128660625030727e-07,
"loss": 0.8463,
"step": 8530
},
{
"epoch": 0.13287459036172714,
"grad_norm": 3.174940347671509,
"learning_rate": 9.127841234984676e-07,
"loss": 0.7382,
"step": 8535
},
{
"epoch": 0.13295243136369653,
"grad_norm": 4.856359004974365,
"learning_rate": 9.127021844938627e-07,
"loss": 0.7025,
"step": 8540
},
{
"epoch": 0.1330302723656659,
"grad_norm": 3.6859755516052246,
"learning_rate": 9.126202454892578e-07,
"loss": 0.642,
"step": 8545
},
{
"epoch": 0.13310811336763526,
"grad_norm": 3.6811821460723877,
"learning_rate": 9.125383064846528e-07,
"loss": 0.828,
"step": 8550
},
{
"epoch": 0.13318595436960465,
"grad_norm": 7.908188819885254,
"learning_rate": 9.124563674800478e-07,
"loss": 0.8043,
"step": 8555
},
{
"epoch": 0.133263795371574,
"grad_norm": 3.508521795272827,
"learning_rate": 9.123744284754429e-07,
"loss": 0.8052,
"step": 8560
},
{
"epoch": 0.1333416363735434,
"grad_norm": 4.004594326019287,
"learning_rate": 9.122924894708378e-07,
"loss": 0.8126,
"step": 8565
},
{
"epoch": 0.13341947737551277,
"grad_norm": 4.229026794433594,
"learning_rate": 9.122105504662328e-07,
"loss": 0.7793,
"step": 8570
},
{
"epoch": 0.13349731837748216,
"grad_norm": 5.319640159606934,
"learning_rate": 9.121286114616279e-07,
"loss": 0.765,
"step": 8575
},
{
"epoch": 0.13357515937945152,
"grad_norm": 3.8783299922943115,
"learning_rate": 9.120466724570229e-07,
"loss": 0.7036,
"step": 8580
},
{
"epoch": 0.13365300038142092,
"grad_norm": 5.243676662445068,
"learning_rate": 9.11964733452418e-07,
"loss": 0.8525,
"step": 8585
},
{
"epoch": 0.13373084138339028,
"grad_norm": 6.118826866149902,
"learning_rate": 9.11882794447813e-07,
"loss": 0.8391,
"step": 8590
},
{
"epoch": 0.13380868238535967,
"grad_norm": 2.1617751121520996,
"learning_rate": 9.118008554432081e-07,
"loss": 0.7353,
"step": 8595
},
{
"epoch": 0.13388652338732904,
"grad_norm": 7.590507507324219,
"learning_rate": 9.117189164386031e-07,
"loss": 0.8919,
"step": 8600
},
{
"epoch": 0.13396436438929843,
"grad_norm": 3.345130443572998,
"learning_rate": 9.116369774339982e-07,
"loss": 0.7615,
"step": 8605
},
{
"epoch": 0.1340422053912678,
"grad_norm": 3.144171953201294,
"learning_rate": 9.115550384293931e-07,
"loss": 0.7714,
"step": 8610
},
{
"epoch": 0.13412004639323719,
"grad_norm": 5.711132049560547,
"learning_rate": 9.114730994247881e-07,
"loss": 0.9119,
"step": 8615
},
{
"epoch": 0.13419788739520655,
"grad_norm": 3.997664451599121,
"learning_rate": 9.113911604201832e-07,
"loss": 0.7821,
"step": 8620
},
{
"epoch": 0.13427572839717594,
"grad_norm": 5.540621757507324,
"learning_rate": 9.113092214155782e-07,
"loss": 0.8796,
"step": 8625
},
{
"epoch": 0.1343535693991453,
"grad_norm": 4.296466827392578,
"learning_rate": 9.112272824109732e-07,
"loss": 0.839,
"step": 8630
},
{
"epoch": 0.13443141040111467,
"grad_norm": 5.527231693267822,
"learning_rate": 9.111453434063683e-07,
"loss": 0.7966,
"step": 8635
},
{
"epoch": 0.13450925140308406,
"grad_norm": 4.798453330993652,
"learning_rate": 9.110634044017634e-07,
"loss": 0.8808,
"step": 8640
},
{
"epoch": 0.13458709240505343,
"grad_norm": 4.989645957946777,
"learning_rate": 9.109814653971583e-07,
"loss": 0.7845,
"step": 8645
},
{
"epoch": 0.13466493340702282,
"grad_norm": 5.687048435211182,
"learning_rate": 9.108995263925533e-07,
"loss": 0.8627,
"step": 8650
},
{
"epoch": 0.13474277440899218,
"grad_norm": 3.1098756790161133,
"learning_rate": 9.108175873879484e-07,
"loss": 0.7594,
"step": 8655
},
{
"epoch": 0.13482061541096158,
"grad_norm": 8.40995979309082,
"learning_rate": 9.107356483833433e-07,
"loss": 0.7599,
"step": 8660
},
{
"epoch": 0.13489845641293094,
"grad_norm": 6.420483589172363,
"learning_rate": 9.106537093787384e-07,
"loss": 0.8199,
"step": 8665
},
{
"epoch": 0.13497629741490033,
"grad_norm": 5.545563697814941,
"learning_rate": 9.105717703741335e-07,
"loss": 0.7403,
"step": 8670
},
{
"epoch": 0.1350541384168697,
"grad_norm": 8.34080696105957,
"learning_rate": 9.104898313695285e-07,
"loss": 0.7979,
"step": 8675
},
{
"epoch": 0.1351319794188391,
"grad_norm": 6.4696550369262695,
"learning_rate": 9.104078923649235e-07,
"loss": 0.6935,
"step": 8680
},
{
"epoch": 0.13520982042080845,
"grad_norm": 7.9759521484375,
"learning_rate": 9.103259533603186e-07,
"loss": 0.7174,
"step": 8685
},
{
"epoch": 0.13528766142277784,
"grad_norm": 4.094228267669678,
"learning_rate": 9.102440143557135e-07,
"loss": 0.9191,
"step": 8690
},
{
"epoch": 0.1353655024247472,
"grad_norm": 3.2784035205841064,
"learning_rate": 9.101620753511086e-07,
"loss": 0.8157,
"step": 8695
},
{
"epoch": 0.1354433434267166,
"grad_norm": 3.680067300796509,
"learning_rate": 9.100801363465036e-07,
"loss": 0.8478,
"step": 8700
},
{
"epoch": 0.13552118442868596,
"grad_norm": 3.991107225418091,
"learning_rate": 9.099981973418986e-07,
"loss": 0.9003,
"step": 8705
},
{
"epoch": 0.13559902543065536,
"grad_norm": 2.9558584690093994,
"learning_rate": 9.099162583372937e-07,
"loss": 0.7571,
"step": 8710
},
{
"epoch": 0.13567686643262472,
"grad_norm": 2.952221632003784,
"learning_rate": 9.098343193326887e-07,
"loss": 0.8597,
"step": 8715
},
{
"epoch": 0.13575470743459409,
"grad_norm": 8.548612594604492,
"learning_rate": 9.097523803280838e-07,
"loss": 0.7225,
"step": 8720
},
{
"epoch": 0.13583254843656348,
"grad_norm": 9.161630630493164,
"learning_rate": 9.096704413234788e-07,
"loss": 0.6646,
"step": 8725
},
{
"epoch": 0.13591038943853284,
"grad_norm": 4.948508262634277,
"learning_rate": 9.095885023188738e-07,
"loss": 0.6339,
"step": 8730
},
{
"epoch": 0.13598823044050223,
"grad_norm": 6.353402614593506,
"learning_rate": 9.095065633142688e-07,
"loss": 0.7525,
"step": 8735
},
{
"epoch": 0.1360660714424716,
"grad_norm": 2.8166439533233643,
"learning_rate": 9.094246243096638e-07,
"loss": 0.7942,
"step": 8740
},
{
"epoch": 0.136143912444441,
"grad_norm": 3.2891948223114014,
"learning_rate": 9.093426853050589e-07,
"loss": 0.7832,
"step": 8745
},
{
"epoch": 0.13622175344641035,
"grad_norm": 6.029998779296875,
"learning_rate": 9.09260746300454e-07,
"loss": 0.9606,
"step": 8750
},
{
"epoch": 0.13629959444837975,
"grad_norm": 3.9812381267547607,
"learning_rate": 9.091788072958489e-07,
"loss": 0.8119,
"step": 8755
},
{
"epoch": 0.1363774354503491,
"grad_norm": 2.964101791381836,
"learning_rate": 9.09096868291244e-07,
"loss": 0.7792,
"step": 8760
},
{
"epoch": 0.1364552764523185,
"grad_norm": 5.025110244750977,
"learning_rate": 9.090149292866391e-07,
"loss": 0.9316,
"step": 8765
},
{
"epoch": 0.13653311745428787,
"grad_norm": 8.131609916687012,
"learning_rate": 9.089329902820339e-07,
"loss": 0.8358,
"step": 8770
},
{
"epoch": 0.13661095845625726,
"grad_norm": 6.380354881286621,
"learning_rate": 9.08851051277429e-07,
"loss": 0.8002,
"step": 8775
},
{
"epoch": 0.13668879945822662,
"grad_norm": 3.922022581100464,
"learning_rate": 9.087691122728241e-07,
"loss": 0.7736,
"step": 8780
},
{
"epoch": 0.13676664046019602,
"grad_norm": 7.275602340698242,
"learning_rate": 9.08687173268219e-07,
"loss": 0.8083,
"step": 8785
},
{
"epoch": 0.13684448146216538,
"grad_norm": 4.583987236022949,
"learning_rate": 9.086052342636141e-07,
"loss": 0.7587,
"step": 8790
},
{
"epoch": 0.13692232246413477,
"grad_norm": 2.945908784866333,
"learning_rate": 9.085232952590092e-07,
"loss": 0.7746,
"step": 8795
},
{
"epoch": 0.13700016346610414,
"grad_norm": 3.0301320552825928,
"learning_rate": 9.084413562544042e-07,
"loss": 0.7701,
"step": 8800
},
{
"epoch": 0.1370780044680735,
"grad_norm": 2.6927056312561035,
"learning_rate": 9.083594172497992e-07,
"loss": 0.6471,
"step": 8805
},
{
"epoch": 0.1371558454700429,
"grad_norm": 3.8082404136657715,
"learning_rate": 9.082774782451942e-07,
"loss": 0.8069,
"step": 8810
},
{
"epoch": 0.13723368647201226,
"grad_norm": 3.635481595993042,
"learning_rate": 9.081955392405892e-07,
"loss": 0.7559,
"step": 8815
},
{
"epoch": 0.13731152747398165,
"grad_norm": 3.12910795211792,
"learning_rate": 9.081136002359843e-07,
"loss": 0.7618,
"step": 8820
},
{
"epoch": 0.137389368475951,
"grad_norm": 4.329128742218018,
"learning_rate": 9.080316612313793e-07,
"loss": 0.7491,
"step": 8825
},
{
"epoch": 0.1374672094779204,
"grad_norm": 3.8912899494171143,
"learning_rate": 9.079497222267743e-07,
"loss": 0.8315,
"step": 8830
},
{
"epoch": 0.13754505047988977,
"grad_norm": 5.182728290557861,
"learning_rate": 9.078677832221694e-07,
"loss": 0.8504,
"step": 8835
},
{
"epoch": 0.13762289148185916,
"grad_norm": 2.6342267990112305,
"learning_rate": 9.077858442175645e-07,
"loss": 0.8111,
"step": 8840
},
{
"epoch": 0.13770073248382853,
"grad_norm": 5.059619903564453,
"learning_rate": 9.077039052129595e-07,
"loss": 0.9168,
"step": 8845
},
{
"epoch": 0.13777857348579792,
"grad_norm": 3.4851198196411133,
"learning_rate": 9.076219662083544e-07,
"loss": 0.8299,
"step": 8850
},
{
"epoch": 0.13785641448776728,
"grad_norm": 8.638031959533691,
"learning_rate": 9.075400272037495e-07,
"loss": 0.6855,
"step": 8855
},
{
"epoch": 0.13793425548973667,
"grad_norm": 4.0812811851501465,
"learning_rate": 9.074580881991445e-07,
"loss": 0.7596,
"step": 8860
},
{
"epoch": 0.13801209649170604,
"grad_norm": 3.0092952251434326,
"learning_rate": 9.073761491945395e-07,
"loss": 0.7839,
"step": 8865
},
{
"epoch": 0.13808993749367543,
"grad_norm": 3.5875368118286133,
"learning_rate": 9.072942101899346e-07,
"loss": 0.8173,
"step": 8870
},
{
"epoch": 0.1381677784956448,
"grad_norm": 6.807621955871582,
"learning_rate": 9.072122711853297e-07,
"loss": 0.8204,
"step": 8875
},
{
"epoch": 0.13824561949761419,
"grad_norm": 5.541608810424805,
"learning_rate": 9.071303321807246e-07,
"loss": 0.7516,
"step": 8880
},
{
"epoch": 0.13832346049958355,
"grad_norm": 4.05478048324585,
"learning_rate": 9.070483931761197e-07,
"loss": 0.841,
"step": 8885
},
{
"epoch": 0.13840130150155291,
"grad_norm": 3.128432512283325,
"learning_rate": 9.069664541715148e-07,
"loss": 0.694,
"step": 8890
},
{
"epoch": 0.1384791425035223,
"grad_norm": 4.354421138763428,
"learning_rate": 9.068845151669096e-07,
"loss": 0.885,
"step": 8895
},
{
"epoch": 0.13855698350549167,
"grad_norm": 4.6781134605407715,
"learning_rate": 9.068025761623047e-07,
"loss": 0.9041,
"step": 8900
},
{
"epoch": 0.13863482450746106,
"grad_norm": 12.059392929077148,
"learning_rate": 9.067206371576998e-07,
"loss": 0.7997,
"step": 8905
},
{
"epoch": 0.13871266550943043,
"grad_norm": 2.994907855987549,
"learning_rate": 9.066386981530948e-07,
"loss": 0.7988,
"step": 8910
},
{
"epoch": 0.13879050651139982,
"grad_norm": 4.420156478881836,
"learning_rate": 9.065567591484898e-07,
"loss": 0.8653,
"step": 8915
},
{
"epoch": 0.13886834751336918,
"grad_norm": 8.454998016357422,
"learning_rate": 9.064748201438849e-07,
"loss": 0.8991,
"step": 8920
},
{
"epoch": 0.13894618851533858,
"grad_norm": 3.3839731216430664,
"learning_rate": 9.063928811392799e-07,
"loss": 0.7173,
"step": 8925
},
{
"epoch": 0.13902402951730794,
"grad_norm": 5.453253746032715,
"learning_rate": 9.06310942134675e-07,
"loss": 0.7141,
"step": 8930
},
{
"epoch": 0.13910187051927733,
"grad_norm": 9.229926109313965,
"learning_rate": 9.062290031300699e-07,
"loss": 0.7292,
"step": 8935
},
{
"epoch": 0.1391797115212467,
"grad_norm": 5.860846042633057,
"learning_rate": 9.061470641254649e-07,
"loss": 0.8563,
"step": 8940
},
{
"epoch": 0.1392575525232161,
"grad_norm": 4.182551383972168,
"learning_rate": 9.0606512512086e-07,
"loss": 0.7552,
"step": 8945
},
{
"epoch": 0.13933539352518545,
"grad_norm": 3.165614604949951,
"learning_rate": 9.05983186116255e-07,
"loss": 0.7887,
"step": 8950
},
{
"epoch": 0.13941323452715484,
"grad_norm": 3.805906057357788,
"learning_rate": 9.0590124711165e-07,
"loss": 0.8226,
"step": 8955
},
{
"epoch": 0.1394910755291242,
"grad_norm": 4.190842151641846,
"learning_rate": 9.058193081070451e-07,
"loss": 0.8951,
"step": 8960
},
{
"epoch": 0.1395689165310936,
"grad_norm": 3.0468575954437256,
"learning_rate": 9.057373691024402e-07,
"loss": 0.7346,
"step": 8965
},
{
"epoch": 0.13964675753306297,
"grad_norm": 4.917840003967285,
"learning_rate": 9.056554300978352e-07,
"loss": 0.8478,
"step": 8970
},
{
"epoch": 0.13972459853503233,
"grad_norm": 3.5989246368408203,
"learning_rate": 9.055734910932301e-07,
"loss": 0.702,
"step": 8975
},
{
"epoch": 0.13980243953700172,
"grad_norm": 3.706799030303955,
"learning_rate": 9.054915520886252e-07,
"loss": 0.6731,
"step": 8980
},
{
"epoch": 0.13988028053897109,
"grad_norm": 2.194471836090088,
"learning_rate": 9.054096130840202e-07,
"loss": 0.7718,
"step": 8985
},
{
"epoch": 0.13995812154094048,
"grad_norm": 4.610592842102051,
"learning_rate": 9.053276740794152e-07,
"loss": 0.657,
"step": 8990
},
{
"epoch": 0.14003596254290984,
"grad_norm": 9.641939163208008,
"learning_rate": 9.052457350748103e-07,
"loss": 0.7753,
"step": 8995
},
{
"epoch": 0.14011380354487923,
"grad_norm": 4.634001731872559,
"learning_rate": 9.051637960702054e-07,
"loss": 0.8575,
"step": 9000
},
{
"epoch": 0.1401916445468486,
"grad_norm": 4.204237461090088,
"learning_rate": 9.050818570656003e-07,
"loss": 0.8538,
"step": 9005
},
{
"epoch": 0.140269485548818,
"grad_norm": 4.357415199279785,
"learning_rate": 9.049999180609954e-07,
"loss": 0.7938,
"step": 9010
},
{
"epoch": 0.14034732655078735,
"grad_norm": 6.758501052856445,
"learning_rate": 9.049179790563904e-07,
"loss": 0.867,
"step": 9015
},
{
"epoch": 0.14042516755275675,
"grad_norm": 4.51594877243042,
"learning_rate": 9.048360400517853e-07,
"loss": 0.7706,
"step": 9020
},
{
"epoch": 0.1405030085547261,
"grad_norm": 3.3741414546966553,
"learning_rate": 9.047541010471804e-07,
"loss": 0.8655,
"step": 9025
},
{
"epoch": 0.1405808495566955,
"grad_norm": 5.02528190612793,
"learning_rate": 9.046721620425755e-07,
"loss": 0.7714,
"step": 9030
},
{
"epoch": 0.14065869055866487,
"grad_norm": 3.8374040126800537,
"learning_rate": 9.045902230379705e-07,
"loss": 0.8834,
"step": 9035
},
{
"epoch": 0.14073653156063426,
"grad_norm": 3.829531192779541,
"learning_rate": 9.045082840333655e-07,
"loss": 0.8477,
"step": 9040
},
{
"epoch": 0.14081437256260362,
"grad_norm": 7.790329456329346,
"learning_rate": 9.044263450287606e-07,
"loss": 0.8359,
"step": 9045
},
{
"epoch": 0.14089221356457302,
"grad_norm": 4.2702460289001465,
"learning_rate": 9.043444060241556e-07,
"loss": 0.8458,
"step": 9050
},
{
"epoch": 0.14097005456654238,
"grad_norm": 5.226367950439453,
"learning_rate": 9.042624670195506e-07,
"loss": 0.9577,
"step": 9055
},
{
"epoch": 0.14104789556851174,
"grad_norm": 4.627621650695801,
"learning_rate": 9.041805280149456e-07,
"loss": 0.8082,
"step": 9060
},
{
"epoch": 0.14112573657048114,
"grad_norm": 4.152894973754883,
"learning_rate": 9.040985890103406e-07,
"loss": 0.6948,
"step": 9065
},
{
"epoch": 0.1412035775724505,
"grad_norm": 6.219531059265137,
"learning_rate": 9.040166500057357e-07,
"loss": 0.8201,
"step": 9070
},
{
"epoch": 0.1412814185744199,
"grad_norm": 4.419485569000244,
"learning_rate": 9.039347110011307e-07,
"loss": 0.8803,
"step": 9075
},
{
"epoch": 0.14135925957638926,
"grad_norm": 4.359714508056641,
"learning_rate": 9.038527719965257e-07,
"loss": 0.7647,
"step": 9080
},
{
"epoch": 0.14143710057835865,
"grad_norm": 6.366148948669434,
"learning_rate": 9.037708329919208e-07,
"loss": 0.8826,
"step": 9085
},
{
"epoch": 0.141514941580328,
"grad_norm": 3.5685646533966064,
"learning_rate": 9.036888939873159e-07,
"loss": 0.7997,
"step": 9090
},
{
"epoch": 0.1415927825822974,
"grad_norm": 3.0451033115386963,
"learning_rate": 9.036069549827107e-07,
"loss": 0.7774,
"step": 9095
},
{
"epoch": 0.14167062358426677,
"grad_norm": 4.7669291496276855,
"learning_rate": 9.035250159781058e-07,
"loss": 0.8111,
"step": 9100
},
{
"epoch": 0.14174846458623616,
"grad_norm": 3.8222289085388184,
"learning_rate": 9.034430769735009e-07,
"loss": 0.8369,
"step": 9105
},
{
"epoch": 0.14182630558820553,
"grad_norm": 9.232769966125488,
"learning_rate": 9.03361137968896e-07,
"loss": 0.7547,
"step": 9110
},
{
"epoch": 0.14190414659017492,
"grad_norm": 2.8610751628875732,
"learning_rate": 9.032791989642909e-07,
"loss": 0.6919,
"step": 9115
},
{
"epoch": 0.14198198759214428,
"grad_norm": 5.900112152099609,
"learning_rate": 9.03197259959686e-07,
"loss": 0.8343,
"step": 9120
},
{
"epoch": 0.14205982859411367,
"grad_norm": 4.022780418395996,
"learning_rate": 9.031153209550811e-07,
"loss": 0.867,
"step": 9125
},
{
"epoch": 0.14213766959608304,
"grad_norm": 3.068723201751709,
"learning_rate": 9.03033381950476e-07,
"loss": 0.7645,
"step": 9130
},
{
"epoch": 0.14221551059805243,
"grad_norm": 3.156970977783203,
"learning_rate": 9.02951442945871e-07,
"loss": 0.6631,
"step": 9135
},
{
"epoch": 0.1422933516000218,
"grad_norm": 3.7697770595550537,
"learning_rate": 9.028695039412661e-07,
"loss": 0.7945,
"step": 9140
},
{
"epoch": 0.1423711926019912,
"grad_norm": 9.904714584350586,
"learning_rate": 9.027875649366611e-07,
"loss": 0.8339,
"step": 9145
},
{
"epoch": 0.14244903360396055,
"grad_norm": 3.7701642513275146,
"learning_rate": 9.027056259320561e-07,
"loss": 0.7597,
"step": 9150
},
{
"epoch": 0.14252687460592992,
"grad_norm": 4.5266828536987305,
"learning_rate": 9.026236869274512e-07,
"loss": 0.7383,
"step": 9155
},
{
"epoch": 0.1426047156078993,
"grad_norm": 5.614555358886719,
"learning_rate": 9.025417479228462e-07,
"loss": 0.7889,
"step": 9160
},
{
"epoch": 0.14268255660986867,
"grad_norm": 7.812586784362793,
"learning_rate": 9.024598089182412e-07,
"loss": 0.8535,
"step": 9165
},
{
"epoch": 0.14276039761183806,
"grad_norm": 5.218164443969727,
"learning_rate": 9.023778699136363e-07,
"loss": 0.814,
"step": 9170
},
{
"epoch": 0.14283823861380743,
"grad_norm": 2.824345588684082,
"learning_rate": 9.022959309090312e-07,
"loss": 0.8305,
"step": 9175
},
{
"epoch": 0.14291607961577682,
"grad_norm": 3.954688787460327,
"learning_rate": 9.022139919044263e-07,
"loss": 0.8707,
"step": 9180
},
{
"epoch": 0.14299392061774618,
"grad_norm": 4.215836048126221,
"learning_rate": 9.021320528998213e-07,
"loss": 0.9382,
"step": 9185
},
{
"epoch": 0.14307176161971558,
"grad_norm": 9.17077350616455,
"learning_rate": 9.020501138952163e-07,
"loss": 0.8275,
"step": 9190
},
{
"epoch": 0.14314960262168494,
"grad_norm": 6.527652740478516,
"learning_rate": 9.019681748906114e-07,
"loss": 0.7504,
"step": 9195
},
{
"epoch": 0.14322744362365433,
"grad_norm": 3.313584089279175,
"learning_rate": 9.018862358860065e-07,
"loss": 0.8984,
"step": 9200
},
{
"epoch": 0.1433052846256237,
"grad_norm": 4.872673988342285,
"learning_rate": 9.018042968814014e-07,
"loss": 0.9078,
"step": 9205
},
{
"epoch": 0.1433831256275931,
"grad_norm": 4.736050128936768,
"learning_rate": 9.017223578767965e-07,
"loss": 0.7167,
"step": 9210
},
{
"epoch": 0.14346096662956245,
"grad_norm": 3.0113682746887207,
"learning_rate": 9.016404188721916e-07,
"loss": 0.901,
"step": 9215
},
{
"epoch": 0.14353880763153185,
"grad_norm": 4.565042495727539,
"learning_rate": 9.015584798675864e-07,
"loss": 0.7495,
"step": 9220
},
{
"epoch": 0.1436166486335012,
"grad_norm": 3.0236637592315674,
"learning_rate": 9.014765408629815e-07,
"loss": 0.7655,
"step": 9225
},
{
"epoch": 0.1436944896354706,
"grad_norm": 5.903986930847168,
"learning_rate": 9.013946018583766e-07,
"loss": 0.8993,
"step": 9230
},
{
"epoch": 0.14377233063743997,
"grad_norm": 4.84224796295166,
"learning_rate": 9.013126628537717e-07,
"loss": 0.7633,
"step": 9235
},
{
"epoch": 0.14385017163940933,
"grad_norm": 3.5455453395843506,
"learning_rate": 9.012307238491666e-07,
"loss": 0.7984,
"step": 9240
},
{
"epoch": 0.14392801264137872,
"grad_norm": 9.098531723022461,
"learning_rate": 9.011487848445617e-07,
"loss": 0.8721,
"step": 9245
},
{
"epoch": 0.1440058536433481,
"grad_norm": 3.4469218254089355,
"learning_rate": 9.010668458399568e-07,
"loss": 0.7364,
"step": 9250
},
{
"epoch": 0.14408369464531748,
"grad_norm": 3.0461971759796143,
"learning_rate": 9.009849068353517e-07,
"loss": 0.8158,
"step": 9255
},
{
"epoch": 0.14416153564728684,
"grad_norm": 5.753521919250488,
"learning_rate": 9.009029678307467e-07,
"loss": 0.7637,
"step": 9260
},
{
"epoch": 0.14423937664925623,
"grad_norm": 10.035242080688477,
"learning_rate": 9.008210288261418e-07,
"loss": 0.8554,
"step": 9265
},
{
"epoch": 0.1443172176512256,
"grad_norm": 3.796072483062744,
"learning_rate": 9.007390898215368e-07,
"loss": 0.8241,
"step": 9270
},
{
"epoch": 0.144395058653195,
"grad_norm": 4.330554962158203,
"learning_rate": 9.006571508169318e-07,
"loss": 0.7456,
"step": 9275
},
{
"epoch": 0.14447289965516436,
"grad_norm": 3.3564655780792236,
"learning_rate": 9.005752118123269e-07,
"loss": 0.8016,
"step": 9280
},
{
"epoch": 0.14455074065713375,
"grad_norm": 2.9985568523406982,
"learning_rate": 9.004932728077219e-07,
"loss": 0.8099,
"step": 9285
},
{
"epoch": 0.1446285816591031,
"grad_norm": 2.705263376235962,
"learning_rate": 9.00411333803117e-07,
"loss": 0.7769,
"step": 9290
},
{
"epoch": 0.1447064226610725,
"grad_norm": 3.700831890106201,
"learning_rate": 9.00329394798512e-07,
"loss": 0.7327,
"step": 9295
},
{
"epoch": 0.14478426366304187,
"grad_norm": 6.56169319152832,
"learning_rate": 9.002474557939069e-07,
"loss": 0.7674,
"step": 9300
},
{
"epoch": 0.14486210466501126,
"grad_norm": 4.469850063323975,
"learning_rate": 9.00165516789302e-07,
"loss": 0.8658,
"step": 9305
},
{
"epoch": 0.14493994566698062,
"grad_norm": 8.878783226013184,
"learning_rate": 9.00083577784697e-07,
"loss": 0.7462,
"step": 9310
},
{
"epoch": 0.14501778666895002,
"grad_norm": 2.435040235519409,
"learning_rate": 9.00001638780092e-07,
"loss": 0.8363,
"step": 9315
},
{
"epoch": 0.14509562767091938,
"grad_norm": 3.0248606204986572,
"learning_rate": 8.999196997754871e-07,
"loss": 0.8078,
"step": 9320
},
{
"epoch": 0.14517346867288874,
"grad_norm": 3.466975450515747,
"learning_rate": 8.998377607708822e-07,
"loss": 0.7242,
"step": 9325
},
{
"epoch": 0.14525130967485814,
"grad_norm": 3.6273536682128906,
"learning_rate": 8.997558217662771e-07,
"loss": 0.8677,
"step": 9330
},
{
"epoch": 0.1453291506768275,
"grad_norm": 6.970489025115967,
"learning_rate": 8.996738827616722e-07,
"loss": 0.8854,
"step": 9335
},
{
"epoch": 0.1454069916787969,
"grad_norm": 3.735153913497925,
"learning_rate": 8.995919437570672e-07,
"loss": 0.7732,
"step": 9340
},
{
"epoch": 0.14548483268076626,
"grad_norm": 8.700521469116211,
"learning_rate": 8.995100047524621e-07,
"loss": 0.9042,
"step": 9345
},
{
"epoch": 0.14556267368273565,
"grad_norm": 7.9652276039123535,
"learning_rate": 8.994280657478572e-07,
"loss": 0.867,
"step": 9350
},
{
"epoch": 0.145640514684705,
"grad_norm": 3.8336143493652344,
"learning_rate": 8.993461267432523e-07,
"loss": 0.8283,
"step": 9355
},
{
"epoch": 0.1457183556866744,
"grad_norm": 7.037674903869629,
"learning_rate": 8.992641877386474e-07,
"loss": 0.7827,
"step": 9360
},
{
"epoch": 0.14579619668864377,
"grad_norm": 6.455174922943115,
"learning_rate": 8.991822487340423e-07,
"loss": 0.7666,
"step": 9365
},
{
"epoch": 0.14587403769061316,
"grad_norm": 3.610822916030884,
"learning_rate": 8.991003097294374e-07,
"loss": 0.8234,
"step": 9370
},
{
"epoch": 0.14595187869258253,
"grad_norm": 4.823882102966309,
"learning_rate": 8.990183707248325e-07,
"loss": 0.7568,
"step": 9375
},
{
"epoch": 0.14602971969455192,
"grad_norm": 5.625290393829346,
"learning_rate": 8.989364317202273e-07,
"loss": 0.7957,
"step": 9380
},
{
"epoch": 0.14610756069652128,
"grad_norm": 5.454258441925049,
"learning_rate": 8.988544927156224e-07,
"loss": 0.6412,
"step": 9385
},
{
"epoch": 0.14618540169849067,
"grad_norm": 5.896919250488281,
"learning_rate": 8.987725537110175e-07,
"loss": 0.7889,
"step": 9390
},
{
"epoch": 0.14626324270046004,
"grad_norm": 3.3774161338806152,
"learning_rate": 8.986906147064125e-07,
"loss": 0.9058,
"step": 9395
},
{
"epoch": 0.14634108370242943,
"grad_norm": 3.2311689853668213,
"learning_rate": 8.986086757018075e-07,
"loss": 0.7764,
"step": 9400
},
{
"epoch": 0.1464189247043988,
"grad_norm": 5.298305511474609,
"learning_rate": 8.985267366972026e-07,
"loss": 0.7435,
"step": 9405
},
{
"epoch": 0.14649676570636816,
"grad_norm": 4.45401668548584,
"learning_rate": 8.984447976925976e-07,
"loss": 0.7631,
"step": 9410
},
{
"epoch": 0.14657460670833755,
"grad_norm": 3.3566505908966064,
"learning_rate": 8.983628586879927e-07,
"loss": 0.8497,
"step": 9415
},
{
"epoch": 0.14665244771030692,
"grad_norm": 3.440115213394165,
"learning_rate": 8.982809196833876e-07,
"loss": 0.7614,
"step": 9420
},
{
"epoch": 0.1467302887122763,
"grad_norm": 3.5758979320526123,
"learning_rate": 8.981989806787826e-07,
"loss": 0.651,
"step": 9425
},
{
"epoch": 0.14680812971424567,
"grad_norm": 4.706757545471191,
"learning_rate": 8.981170416741777e-07,
"loss": 0.9389,
"step": 9430
},
{
"epoch": 0.14688597071621506,
"grad_norm": 4.142934799194336,
"learning_rate": 8.980351026695727e-07,
"loss": 0.9398,
"step": 9435
},
{
"epoch": 0.14696381171818443,
"grad_norm": 3.549654483795166,
"learning_rate": 8.979531636649677e-07,
"loss": 0.7552,
"step": 9440
},
{
"epoch": 0.14704165272015382,
"grad_norm": 3.9092509746551514,
"learning_rate": 8.978712246603628e-07,
"loss": 0.8207,
"step": 9445
},
{
"epoch": 0.14711949372212318,
"grad_norm": 4.208457946777344,
"learning_rate": 8.977892856557579e-07,
"loss": 0.7046,
"step": 9450
},
{
"epoch": 0.14719733472409258,
"grad_norm": 8.339073181152344,
"learning_rate": 8.977073466511528e-07,
"loss": 0.7736,
"step": 9455
},
{
"epoch": 0.14727517572606194,
"grad_norm": 2.834137439727783,
"learning_rate": 8.976254076465478e-07,
"loss": 0.893,
"step": 9460
},
{
"epoch": 0.14735301672803133,
"grad_norm": 4.870460033416748,
"learning_rate": 8.975434686419429e-07,
"loss": 0.819,
"step": 9465
},
{
"epoch": 0.1474308577300007,
"grad_norm": 4.1774582862854,
"learning_rate": 8.974615296373378e-07,
"loss": 0.7164,
"step": 9470
},
{
"epoch": 0.1475086987319701,
"grad_norm": 3.573882818222046,
"learning_rate": 8.973795906327329e-07,
"loss": 0.7982,
"step": 9475
},
{
"epoch": 0.14758653973393945,
"grad_norm": 3.5654754638671875,
"learning_rate": 8.97297651628128e-07,
"loss": 0.8293,
"step": 9480
},
{
"epoch": 0.14766438073590885,
"grad_norm": 3.387242078781128,
"learning_rate": 8.972157126235231e-07,
"loss": 0.7351,
"step": 9485
},
{
"epoch": 0.1477422217378782,
"grad_norm": 7.289836883544922,
"learning_rate": 8.97133773618918e-07,
"loss": 0.8013,
"step": 9490
},
{
"epoch": 0.14782006273984757,
"grad_norm": 4.657222270965576,
"learning_rate": 8.970518346143131e-07,
"loss": 0.8237,
"step": 9495
},
{
"epoch": 0.14789790374181697,
"grad_norm": 4.462035179138184,
"learning_rate": 8.969698956097081e-07,
"loss": 0.7025,
"step": 9500
},
{
"epoch": 0.14797574474378633,
"grad_norm": 4.082338809967041,
"learning_rate": 8.968879566051031e-07,
"loss": 0.7665,
"step": 9505
},
{
"epoch": 0.14805358574575572,
"grad_norm": 4.678539276123047,
"learning_rate": 8.968060176004981e-07,
"loss": 0.8325,
"step": 9510
},
{
"epoch": 0.1481314267477251,
"grad_norm": 4.395145893096924,
"learning_rate": 8.967240785958932e-07,
"loss": 0.7172,
"step": 9515
},
{
"epoch": 0.14820926774969448,
"grad_norm": 3.5010783672332764,
"learning_rate": 8.966421395912882e-07,
"loss": 0.7459,
"step": 9520
},
{
"epoch": 0.14828710875166384,
"grad_norm": 5.013630390167236,
"learning_rate": 8.965602005866832e-07,
"loss": 0.7536,
"step": 9525
},
{
"epoch": 0.14836494975363324,
"grad_norm": 3.0716755390167236,
"learning_rate": 8.964782615820783e-07,
"loss": 0.8559,
"step": 9530
},
{
"epoch": 0.1484427907556026,
"grad_norm": 8.94082260131836,
"learning_rate": 8.963963225774733e-07,
"loss": 0.8765,
"step": 9535
},
{
"epoch": 0.148520631757572,
"grad_norm": 5.629791736602783,
"learning_rate": 8.963143835728684e-07,
"loss": 0.9076,
"step": 9540
},
{
"epoch": 0.14859847275954136,
"grad_norm": 4.679615020751953,
"learning_rate": 8.962324445682633e-07,
"loss": 0.7733,
"step": 9545
},
{
"epoch": 0.14867631376151075,
"grad_norm": 2.7258129119873047,
"learning_rate": 8.961505055636583e-07,
"loss": 0.7082,
"step": 9550
},
{
"epoch": 0.1487541547634801,
"grad_norm": 4.701254844665527,
"learning_rate": 8.960685665590534e-07,
"loss": 0.6848,
"step": 9555
},
{
"epoch": 0.1488319957654495,
"grad_norm": 3.3973538875579834,
"learning_rate": 8.959866275544485e-07,
"loss": 0.87,
"step": 9560
},
{
"epoch": 0.14890983676741887,
"grad_norm": 5.04672384262085,
"learning_rate": 8.959046885498434e-07,
"loss": 0.7941,
"step": 9565
},
{
"epoch": 0.14898767776938826,
"grad_norm": 5.399540901184082,
"learning_rate": 8.958227495452385e-07,
"loss": 0.7677,
"step": 9570
},
{
"epoch": 0.14906551877135762,
"grad_norm": 3.8206582069396973,
"learning_rate": 8.957408105406336e-07,
"loss": 0.7326,
"step": 9575
},
{
"epoch": 0.149143359773327,
"grad_norm": 2.07293963432312,
"learning_rate": 8.956588715360285e-07,
"loss": 0.7492,
"step": 9580
},
{
"epoch": 0.14922120077529638,
"grad_norm": 8.431818008422852,
"learning_rate": 8.955769325314235e-07,
"loss": 0.7455,
"step": 9585
},
{
"epoch": 0.14929904177726575,
"grad_norm": 7.549500465393066,
"learning_rate": 8.954949935268186e-07,
"loss": 0.7746,
"step": 9590
},
{
"epoch": 0.14937688277923514,
"grad_norm": 5.316057205200195,
"learning_rate": 8.954130545222136e-07,
"loss": 0.7876,
"step": 9595
},
{
"epoch": 0.1494547237812045,
"grad_norm": 7.522194862365723,
"learning_rate": 8.953311155176086e-07,
"loss": 0.802,
"step": 9600
},
{
"epoch": 0.1495325647831739,
"grad_norm": 3.6271915435791016,
"learning_rate": 8.952491765130037e-07,
"loss": 0.7795,
"step": 9605
},
{
"epoch": 0.14961040578514326,
"grad_norm": 5.297993183135986,
"learning_rate": 8.951672375083988e-07,
"loss": 0.7834,
"step": 9610
},
{
"epoch": 0.14968824678711265,
"grad_norm": 3.3175265789031982,
"learning_rate": 8.950852985037937e-07,
"loss": 0.6759,
"step": 9615
},
{
"epoch": 0.14976608778908201,
"grad_norm": 3.857908010482788,
"learning_rate": 8.950033594991888e-07,
"loss": 0.7961,
"step": 9620
},
{
"epoch": 0.1498439287910514,
"grad_norm": 6.590905666351318,
"learning_rate": 8.949214204945838e-07,
"loss": 0.8008,
"step": 9625
},
{
"epoch": 0.14992176979302077,
"grad_norm": 3.9845802783966064,
"learning_rate": 8.948394814899788e-07,
"loss": 0.8439,
"step": 9630
},
{
"epoch": 0.14999961079499016,
"grad_norm": 4.234025001525879,
"learning_rate": 8.947575424853738e-07,
"loss": 0.829,
"step": 9635
},
{
"epoch": 0.15007745179695953,
"grad_norm": 4.937478542327881,
"learning_rate": 8.946756034807689e-07,
"loss": 0.8498,
"step": 9640
},
{
"epoch": 0.15015529279892892,
"grad_norm": 5.999597072601318,
"learning_rate": 8.945936644761639e-07,
"loss": 0.7593,
"step": 9645
},
{
"epoch": 0.15023313380089828,
"grad_norm": 5.461806774139404,
"learning_rate": 8.94511725471559e-07,
"loss": 0.8598,
"step": 9650
},
{
"epoch": 0.15031097480286768,
"grad_norm": 5.157901763916016,
"learning_rate": 8.94429786466954e-07,
"loss": 0.8738,
"step": 9655
},
{
"epoch": 0.15038881580483704,
"grad_norm": 3.0504953861236572,
"learning_rate": 8.94347847462349e-07,
"loss": 0.7486,
"step": 9660
},
{
"epoch": 0.1504666568068064,
"grad_norm": 10.305487632751465,
"learning_rate": 8.94265908457744e-07,
"loss": 0.7844,
"step": 9665
},
{
"epoch": 0.1505444978087758,
"grad_norm": 3.5192887783050537,
"learning_rate": 8.94183969453139e-07,
"loss": 0.8386,
"step": 9670
},
{
"epoch": 0.15062233881074516,
"grad_norm": 4.425221920013428,
"learning_rate": 8.94102030448534e-07,
"loss": 0.7428,
"step": 9675
},
{
"epoch": 0.15070017981271455,
"grad_norm": 5.125747203826904,
"learning_rate": 8.940200914439291e-07,
"loss": 0.7855,
"step": 9680
},
{
"epoch": 0.15077802081468392,
"grad_norm": 5.673059463500977,
"learning_rate": 8.939381524393242e-07,
"loss": 0.9193,
"step": 9685
},
{
"epoch": 0.1508558618166533,
"grad_norm": 2.934014320373535,
"learning_rate": 8.938562134347191e-07,
"loss": 0.6473,
"step": 9690
},
{
"epoch": 0.15093370281862267,
"grad_norm": 6.39523983001709,
"learning_rate": 8.937742744301142e-07,
"loss": 0.7309,
"step": 9695
},
{
"epoch": 0.15101154382059206,
"grad_norm": 3.2489750385284424,
"learning_rate": 8.936923354255093e-07,
"loss": 0.8391,
"step": 9700
},
{
"epoch": 0.15108938482256143,
"grad_norm": 3.2904443740844727,
"learning_rate": 8.936103964209041e-07,
"loss": 0.6639,
"step": 9705
},
{
"epoch": 0.15116722582453082,
"grad_norm": 5.773887634277344,
"learning_rate": 8.935284574162992e-07,
"loss": 0.7859,
"step": 9710
},
{
"epoch": 0.15124506682650019,
"grad_norm": 4.7919697761535645,
"learning_rate": 8.934465184116943e-07,
"loss": 0.7519,
"step": 9715
},
{
"epoch": 0.15132290782846958,
"grad_norm": 4.2467942237854,
"learning_rate": 8.933645794070893e-07,
"loss": 0.8234,
"step": 9720
},
{
"epoch": 0.15140074883043894,
"grad_norm": 3.2644877433776855,
"learning_rate": 8.932826404024843e-07,
"loss": 0.6703,
"step": 9725
},
{
"epoch": 0.15147858983240833,
"grad_norm": 4.7553534507751465,
"learning_rate": 8.932007013978794e-07,
"loss": 0.7273,
"step": 9730
},
{
"epoch": 0.1515564308343777,
"grad_norm": 3.009917974472046,
"learning_rate": 8.931187623932745e-07,
"loss": 0.8056,
"step": 9735
},
{
"epoch": 0.1516342718363471,
"grad_norm": 2.296459436416626,
"learning_rate": 8.930368233886695e-07,
"loss": 0.7561,
"step": 9740
},
{
"epoch": 0.15171211283831645,
"grad_norm": 9.049659729003906,
"learning_rate": 8.929548843840644e-07,
"loss": 0.7144,
"step": 9745
},
{
"epoch": 0.15178995384028582,
"grad_norm": 6.223394393920898,
"learning_rate": 8.928729453794595e-07,
"loss": 0.737,
"step": 9750
},
{
"epoch": 0.1518677948422552,
"grad_norm": 7.632771968841553,
"learning_rate": 8.927910063748545e-07,
"loss": 0.8498,
"step": 9755
},
{
"epoch": 0.15194563584422457,
"grad_norm": 6.848038196563721,
"learning_rate": 8.927090673702495e-07,
"loss": 0.806,
"step": 9760
},
{
"epoch": 0.15202347684619397,
"grad_norm": 4.97011137008667,
"learning_rate": 8.926271283656446e-07,
"loss": 0.7157,
"step": 9765
},
{
"epoch": 0.15210131784816333,
"grad_norm": 3.5741093158721924,
"learning_rate": 8.925451893610396e-07,
"loss": 0.8259,
"step": 9770
},
{
"epoch": 0.15217915885013272,
"grad_norm": 3.7775120735168457,
"learning_rate": 8.924632503564347e-07,
"loss": 0.7166,
"step": 9775
},
{
"epoch": 0.1522569998521021,
"grad_norm": 3.5487003326416016,
"learning_rate": 8.923813113518297e-07,
"loss": 0.7971,
"step": 9780
},
{
"epoch": 0.15233484085407148,
"grad_norm": 4.203275203704834,
"learning_rate": 8.922993723472246e-07,
"loss": 0.8493,
"step": 9785
},
{
"epoch": 0.15241268185604084,
"grad_norm": 3.4415955543518066,
"learning_rate": 8.922174333426197e-07,
"loss": 0.7878,
"step": 9790
},
{
"epoch": 0.15249052285801024,
"grad_norm": 3.142867088317871,
"learning_rate": 8.921354943380148e-07,
"loss": 0.7424,
"step": 9795
},
{
"epoch": 0.1525683638599796,
"grad_norm": 4.120011806488037,
"learning_rate": 8.920535553334097e-07,
"loss": 0.8535,
"step": 9800
},
{
"epoch": 0.152646204861949,
"grad_norm": 3.5447614192962646,
"learning_rate": 8.919716163288048e-07,
"loss": 0.7767,
"step": 9805
},
{
"epoch": 0.15272404586391836,
"grad_norm": 3.2715671062469482,
"learning_rate": 8.918896773241999e-07,
"loss": 0.8062,
"step": 9810
},
{
"epoch": 0.15280188686588775,
"grad_norm": 2.4431989192962646,
"learning_rate": 8.918077383195948e-07,
"loss": 0.7578,
"step": 9815
},
{
"epoch": 0.1528797278678571,
"grad_norm": 3.4583024978637695,
"learning_rate": 8.917257993149899e-07,
"loss": 0.7531,
"step": 9820
},
{
"epoch": 0.1529575688698265,
"grad_norm": 2.993116617202759,
"learning_rate": 8.916438603103849e-07,
"loss": 0.7672,
"step": 9825
},
{
"epoch": 0.15303540987179587,
"grad_norm": 3.1958775520324707,
"learning_rate": 8.915619213057799e-07,
"loss": 0.8598,
"step": 9830
},
{
"epoch": 0.15311325087376526,
"grad_norm": 6.617530345916748,
"learning_rate": 8.914799823011749e-07,
"loss": 0.6993,
"step": 9835
},
{
"epoch": 0.15319109187573463,
"grad_norm": 3.9187653064727783,
"learning_rate": 8.9139804329657e-07,
"loss": 0.7635,
"step": 9840
},
{
"epoch": 0.153268932877704,
"grad_norm": 3.482074499130249,
"learning_rate": 8.91316104291965e-07,
"loss": 0.7737,
"step": 9845
},
{
"epoch": 0.15334677387967338,
"grad_norm": 9.398933410644531,
"learning_rate": 8.9123416528736e-07,
"loss": 0.8737,
"step": 9850
},
{
"epoch": 0.15342461488164275,
"grad_norm": 3.637568473815918,
"learning_rate": 8.911522262827551e-07,
"loss": 0.7667,
"step": 9855
},
{
"epoch": 0.15350245588361214,
"grad_norm": 3.3821427822113037,
"learning_rate": 8.910702872781502e-07,
"loss": 0.7836,
"step": 9860
},
{
"epoch": 0.1535802968855815,
"grad_norm": 2.9914300441741943,
"learning_rate": 8.909883482735452e-07,
"loss": 0.8564,
"step": 9865
},
{
"epoch": 0.1536581378875509,
"grad_norm": 6.171716690063477,
"learning_rate": 8.909064092689401e-07,
"loss": 0.8064,
"step": 9870
},
{
"epoch": 0.15373597888952026,
"grad_norm": 3.4968278408050537,
"learning_rate": 8.908244702643352e-07,
"loss": 0.7524,
"step": 9875
},
{
"epoch": 0.15381381989148965,
"grad_norm": 5.08829402923584,
"learning_rate": 8.907425312597302e-07,
"loss": 0.8207,
"step": 9880
},
{
"epoch": 0.15389166089345901,
"grad_norm": 3.5643150806427,
"learning_rate": 8.906605922551253e-07,
"loss": 0.7776,
"step": 9885
},
{
"epoch": 0.1539695018954284,
"grad_norm": 3.4858274459838867,
"learning_rate": 8.905786532505203e-07,
"loss": 0.8979,
"step": 9890
},
{
"epoch": 0.15404734289739777,
"grad_norm": 3.9792239665985107,
"learning_rate": 8.904967142459153e-07,
"loss": 0.797,
"step": 9895
},
{
"epoch": 0.15412518389936716,
"grad_norm": 9.95738410949707,
"learning_rate": 8.904147752413104e-07,
"loss": 0.7492,
"step": 9900
},
{
"epoch": 0.15420302490133653,
"grad_norm": 4.636844158172607,
"learning_rate": 8.903328362367054e-07,
"loss": 0.7867,
"step": 9905
},
{
"epoch": 0.15428086590330592,
"grad_norm": 7.289409637451172,
"learning_rate": 8.902508972321003e-07,
"loss": 0.7982,
"step": 9910
},
{
"epoch": 0.15435870690527528,
"grad_norm": 3.7324790954589844,
"learning_rate": 8.901689582274954e-07,
"loss": 0.7001,
"step": 9915
},
{
"epoch": 0.15443654790724468,
"grad_norm": 4.735339641571045,
"learning_rate": 8.900870192228905e-07,
"loss": 0.7155,
"step": 9920
},
{
"epoch": 0.15451438890921404,
"grad_norm": 8.172463417053223,
"learning_rate": 8.900050802182854e-07,
"loss": 0.8489,
"step": 9925
},
{
"epoch": 0.1545922299111834,
"grad_norm": 3.3313515186309814,
"learning_rate": 8.899231412136805e-07,
"loss": 0.8064,
"step": 9930
},
{
"epoch": 0.1546700709131528,
"grad_norm": 9.073691368103027,
"learning_rate": 8.898412022090756e-07,
"loss": 0.8577,
"step": 9935
},
{
"epoch": 0.15474791191512216,
"grad_norm": 7.126258373260498,
"learning_rate": 8.897592632044705e-07,
"loss": 0.7679,
"step": 9940
},
{
"epoch": 0.15482575291709155,
"grad_norm": 11.254960060119629,
"learning_rate": 8.896773241998656e-07,
"loss": 0.7725,
"step": 9945
},
{
"epoch": 0.15490359391906092,
"grad_norm": 3.706859827041626,
"learning_rate": 8.895953851952606e-07,
"loss": 0.9131,
"step": 9950
},
{
"epoch": 0.1549814349210303,
"grad_norm": 3.0554311275482178,
"learning_rate": 8.895134461906556e-07,
"loss": 0.7683,
"step": 9955
},
{
"epoch": 0.15505927592299967,
"grad_norm": 3.64799165725708,
"learning_rate": 8.894315071860506e-07,
"loss": 0.7275,
"step": 9960
},
{
"epoch": 0.15513711692496907,
"grad_norm": 3.383768320083618,
"learning_rate": 8.893495681814457e-07,
"loss": 0.8155,
"step": 9965
},
{
"epoch": 0.15521495792693843,
"grad_norm": 3.561455726623535,
"learning_rate": 8.892676291768407e-07,
"loss": 0.7256,
"step": 9970
},
{
"epoch": 0.15529279892890782,
"grad_norm": 3.7967913150787354,
"learning_rate": 8.891856901722358e-07,
"loss": 0.8299,
"step": 9975
},
{
"epoch": 0.15537063993087719,
"grad_norm": 3.6479313373565674,
"learning_rate": 8.891037511676308e-07,
"loss": 0.7851,
"step": 9980
},
{
"epoch": 0.15544848093284658,
"grad_norm": 4.59113883972168,
"learning_rate": 8.890218121630259e-07,
"loss": 0.8115,
"step": 9985
},
{
"epoch": 0.15552632193481594,
"grad_norm": 5.832945823669434,
"learning_rate": 8.889398731584208e-07,
"loss": 0.9049,
"step": 9990
},
{
"epoch": 0.15560416293678533,
"grad_norm": 3.0644237995147705,
"learning_rate": 8.888579341538158e-07,
"loss": 0.6787,
"step": 9995
},
{
"epoch": 0.1556820039387547,
"grad_norm": 4.14565372467041,
"learning_rate": 8.887759951492109e-07,
"loss": 0.8273,
"step": 10000
},
{
"epoch": 0.1557598449407241,
"grad_norm": 10.58462142944336,
"learning_rate": 8.886940561446059e-07,
"loss": 0.7526,
"step": 10005
},
{
"epoch": 0.15583768594269345,
"grad_norm": 4.249096870422363,
"learning_rate": 8.88612117140001e-07,
"loss": 0.7715,
"step": 10010
},
{
"epoch": 0.15591552694466282,
"grad_norm": 6.341519355773926,
"learning_rate": 8.88530178135396e-07,
"loss": 0.7869,
"step": 10015
},
{
"epoch": 0.1559933679466322,
"grad_norm": 2.831510543823242,
"learning_rate": 8.88448239130791e-07,
"loss": 0.807,
"step": 10020
},
{
"epoch": 0.15607120894860158,
"grad_norm": 3.297983169555664,
"learning_rate": 8.883663001261861e-07,
"loss": 0.7433,
"step": 10025
},
{
"epoch": 0.15614904995057097,
"grad_norm": 3.2775771617889404,
"learning_rate": 8.88284361121581e-07,
"loss": 0.7378,
"step": 10030
},
{
"epoch": 0.15622689095254033,
"grad_norm": 4.8905768394470215,
"learning_rate": 8.88202422116976e-07,
"loss": 0.9689,
"step": 10035
},
{
"epoch": 0.15630473195450972,
"grad_norm": 5.260064601898193,
"learning_rate": 8.881204831123711e-07,
"loss": 0.8044,
"step": 10040
},
{
"epoch": 0.1563825729564791,
"grad_norm": 3.0201990604400635,
"learning_rate": 8.880385441077662e-07,
"loss": 0.7943,
"step": 10045
},
{
"epoch": 0.15646041395844848,
"grad_norm": 3.7755088806152344,
"learning_rate": 8.879566051031611e-07,
"loss": 0.7369,
"step": 10050
},
{
"epoch": 0.15653825496041784,
"grad_norm": 4.074087619781494,
"learning_rate": 8.878746660985562e-07,
"loss": 0.8229,
"step": 10055
},
{
"epoch": 0.15661609596238724,
"grad_norm": 4.723263263702393,
"learning_rate": 8.877927270939513e-07,
"loss": 0.6489,
"step": 10060
},
{
"epoch": 0.1566939369643566,
"grad_norm": 4.477192401885986,
"learning_rate": 8.877107880893463e-07,
"loss": 0.8427,
"step": 10065
},
{
"epoch": 0.156771777966326,
"grad_norm": 2.5688400268554688,
"learning_rate": 8.876288490847412e-07,
"loss": 0.8412,
"step": 10070
},
{
"epoch": 0.15684961896829536,
"grad_norm": 4.53289794921875,
"learning_rate": 8.875469100801363e-07,
"loss": 0.8295,
"step": 10075
},
{
"epoch": 0.15692745997026475,
"grad_norm": 7.105226993560791,
"learning_rate": 8.874649710755313e-07,
"loss": 0.8648,
"step": 10080
},
{
"epoch": 0.1570053009722341,
"grad_norm": 2.8844571113586426,
"learning_rate": 8.873830320709263e-07,
"loss": 0.7657,
"step": 10085
},
{
"epoch": 0.1570831419742035,
"grad_norm": 4.569636344909668,
"learning_rate": 8.873010930663214e-07,
"loss": 0.6868,
"step": 10090
},
{
"epoch": 0.15716098297617287,
"grad_norm": 3.5974533557891846,
"learning_rate": 8.872191540617164e-07,
"loss": 0.9193,
"step": 10095
},
{
"epoch": 0.15723882397814223,
"grad_norm": 4.02683687210083,
"learning_rate": 8.871372150571115e-07,
"loss": 0.8309,
"step": 10100
},
{
"epoch": 0.15731666498011163,
"grad_norm": 3.6681370735168457,
"learning_rate": 8.870552760525065e-07,
"loss": 0.7995,
"step": 10105
},
{
"epoch": 0.157394505982081,
"grad_norm": 3.227896213531494,
"learning_rate": 8.869733370479014e-07,
"loss": 0.7712,
"step": 10110
},
{
"epoch": 0.15747234698405038,
"grad_norm": 7.877114295959473,
"learning_rate": 8.868913980432965e-07,
"loss": 0.7099,
"step": 10115
},
{
"epoch": 0.15755018798601975,
"grad_norm": 4.573225021362305,
"learning_rate": 8.868094590386915e-07,
"loss": 0.6932,
"step": 10120
},
{
"epoch": 0.15762802898798914,
"grad_norm": 4.980703353881836,
"learning_rate": 8.867275200340866e-07,
"loss": 0.9058,
"step": 10125
},
{
"epoch": 0.1577058699899585,
"grad_norm": 4.040239334106445,
"learning_rate": 8.866455810294816e-07,
"loss": 0.813,
"step": 10130
},
{
"epoch": 0.1577837109919279,
"grad_norm": 3.489124059677124,
"learning_rate": 8.865636420248767e-07,
"loss": 0.7237,
"step": 10135
},
{
"epoch": 0.15786155199389726,
"grad_norm": 2.5940945148468018,
"learning_rate": 8.864817030202717e-07,
"loss": 0.7494,
"step": 10140
},
{
"epoch": 0.15793939299586665,
"grad_norm": 3.312206745147705,
"learning_rate": 8.863997640156667e-07,
"loss": 0.8447,
"step": 10145
},
{
"epoch": 0.15801723399783602,
"grad_norm": 3.279050350189209,
"learning_rate": 8.863178250110617e-07,
"loss": 0.8192,
"step": 10150
},
{
"epoch": 0.1580950749998054,
"grad_norm": 3.221031904220581,
"learning_rate": 8.862358860064568e-07,
"loss": 0.8593,
"step": 10155
},
{
"epoch": 0.15817291600177477,
"grad_norm": 5.654365062713623,
"learning_rate": 8.861539470018517e-07,
"loss": 0.8396,
"step": 10160
},
{
"epoch": 0.15825075700374416,
"grad_norm": 3.0688130855560303,
"learning_rate": 8.860720079972468e-07,
"loss": 0.6908,
"step": 10165
},
{
"epoch": 0.15832859800571353,
"grad_norm": 4.515695571899414,
"learning_rate": 8.859900689926419e-07,
"loss": 0.8116,
"step": 10170
},
{
"epoch": 0.15840643900768292,
"grad_norm": 5.167048454284668,
"learning_rate": 8.859081299880368e-07,
"loss": 0.8373,
"step": 10175
},
{
"epoch": 0.15848428000965228,
"grad_norm": 8.9721097946167,
"learning_rate": 8.858261909834319e-07,
"loss": 0.9478,
"step": 10180
},
{
"epoch": 0.15856212101162165,
"grad_norm": 3.8844072818756104,
"learning_rate": 8.85744251978827e-07,
"loss": 0.8713,
"step": 10185
},
{
"epoch": 0.15863996201359104,
"grad_norm": 3.705676794052124,
"learning_rate": 8.85662312974222e-07,
"loss": 0.817,
"step": 10190
},
{
"epoch": 0.1587178030155604,
"grad_norm": 4.741096496582031,
"learning_rate": 8.855803739696169e-07,
"loss": 0.8058,
"step": 10195
},
{
"epoch": 0.1587956440175298,
"grad_norm": 6.481576442718506,
"learning_rate": 8.85498434965012e-07,
"loss": 0.9077,
"step": 10200
},
{
"epoch": 0.15887348501949916,
"grad_norm": 5.847426414489746,
"learning_rate": 8.85416495960407e-07,
"loss": 0.7257,
"step": 10205
},
{
"epoch": 0.15895132602146855,
"grad_norm": 3.6452407836914062,
"learning_rate": 8.85334556955802e-07,
"loss": 0.7282,
"step": 10210
},
{
"epoch": 0.15902916702343792,
"grad_norm": 3.5716402530670166,
"learning_rate": 8.852526179511971e-07,
"loss": 0.7504,
"step": 10215
},
{
"epoch": 0.1591070080254073,
"grad_norm": 4.702507495880127,
"learning_rate": 8.851706789465921e-07,
"loss": 0.8345,
"step": 10220
},
{
"epoch": 0.15918484902737667,
"grad_norm": 3.6276495456695557,
"learning_rate": 8.850887399419872e-07,
"loss": 0.8267,
"step": 10225
},
{
"epoch": 0.15926269002934607,
"grad_norm": 3.432413101196289,
"learning_rate": 8.850068009373822e-07,
"loss": 0.9476,
"step": 10230
},
{
"epoch": 0.15934053103131543,
"grad_norm": 4.081861972808838,
"learning_rate": 8.849248619327771e-07,
"loss": 0.7688,
"step": 10235
},
{
"epoch": 0.15941837203328482,
"grad_norm": 6.799472332000732,
"learning_rate": 8.848429229281722e-07,
"loss": 0.7084,
"step": 10240
},
{
"epoch": 0.1594962130352542,
"grad_norm": 2.736294984817505,
"learning_rate": 8.847609839235673e-07,
"loss": 0.7835,
"step": 10245
},
{
"epoch": 0.15957405403722358,
"grad_norm": 4.227167129516602,
"learning_rate": 8.846790449189623e-07,
"loss": 0.7833,
"step": 10250
},
{
"epoch": 0.15965189503919294,
"grad_norm": 3.3678765296936035,
"learning_rate": 8.845971059143573e-07,
"loss": 0.9005,
"step": 10255
},
{
"epoch": 0.15972973604116233,
"grad_norm": 3.6415300369262695,
"learning_rate": 8.845151669097524e-07,
"loss": 0.8214,
"step": 10260
},
{
"epoch": 0.1598075770431317,
"grad_norm": 4.28493595123291,
"learning_rate": 8.844332279051474e-07,
"loss": 0.7597,
"step": 10265
},
{
"epoch": 0.15988541804510106,
"grad_norm": 3.2147064208984375,
"learning_rate": 8.843512889005424e-07,
"loss": 0.7652,
"step": 10270
},
{
"epoch": 0.15996325904707046,
"grad_norm": 7.0327959060668945,
"learning_rate": 8.842693498959374e-07,
"loss": 0.7703,
"step": 10275
},
{
"epoch": 0.16004110004903982,
"grad_norm": 3.714036464691162,
"learning_rate": 8.841874108913325e-07,
"loss": 0.8864,
"step": 10280
},
{
"epoch": 0.1601189410510092,
"grad_norm": 3.1920130252838135,
"learning_rate": 8.841054718867274e-07,
"loss": 0.8203,
"step": 10285
},
{
"epoch": 0.16019678205297858,
"grad_norm": 4.356620788574219,
"learning_rate": 8.840235328821225e-07,
"loss": 0.8215,
"step": 10290
},
{
"epoch": 0.16027462305494797,
"grad_norm": 3.031799793243408,
"learning_rate": 8.839415938775176e-07,
"loss": 0.6909,
"step": 10295
},
{
"epoch": 0.16035246405691733,
"grad_norm": 3.817898750305176,
"learning_rate": 8.838596548729125e-07,
"loss": 0.7237,
"step": 10300
},
{
"epoch": 0.16043030505888672,
"grad_norm": 4.679711818695068,
"learning_rate": 8.837777158683076e-07,
"loss": 0.7296,
"step": 10305
},
{
"epoch": 0.1605081460608561,
"grad_norm": 3.7384095191955566,
"learning_rate": 8.836957768637027e-07,
"loss": 0.9011,
"step": 10310
},
{
"epoch": 0.16058598706282548,
"grad_norm": 4.7621541023254395,
"learning_rate": 8.836138378590976e-07,
"loss": 0.9026,
"step": 10315
},
{
"epoch": 0.16066382806479484,
"grad_norm": 6.402016639709473,
"learning_rate": 8.835318988544926e-07,
"loss": 0.8081,
"step": 10320
},
{
"epoch": 0.16074166906676424,
"grad_norm": 4.472290515899658,
"learning_rate": 8.834499598498877e-07,
"loss": 0.7606,
"step": 10325
},
{
"epoch": 0.1608195100687336,
"grad_norm": 5.197412967681885,
"learning_rate": 8.833680208452827e-07,
"loss": 0.7824,
"step": 10330
},
{
"epoch": 0.160897351070703,
"grad_norm": 3.40364408493042,
"learning_rate": 8.832860818406778e-07,
"loss": 0.7884,
"step": 10335
},
{
"epoch": 0.16097519207267236,
"grad_norm": 6.786139965057373,
"learning_rate": 8.832041428360728e-07,
"loss": 0.8743,
"step": 10340
},
{
"epoch": 0.16105303307464175,
"grad_norm": 4.048831462860107,
"learning_rate": 8.831222038314678e-07,
"loss": 0.8958,
"step": 10345
},
{
"epoch": 0.1611308740766111,
"grad_norm": 4.102180480957031,
"learning_rate": 8.830402648268629e-07,
"loss": 0.7835,
"step": 10350
},
{
"epoch": 0.16120871507858048,
"grad_norm": 3.135587453842163,
"learning_rate": 8.829583258222578e-07,
"loss": 0.6362,
"step": 10355
},
{
"epoch": 0.16128655608054987,
"grad_norm": 3.9217071533203125,
"learning_rate": 8.828763868176528e-07,
"loss": 0.7069,
"step": 10360
},
{
"epoch": 0.16136439708251923,
"grad_norm": 3.276561737060547,
"learning_rate": 8.827944478130479e-07,
"loss": 0.8588,
"step": 10365
},
{
"epoch": 0.16144223808448863,
"grad_norm": 3.9652113914489746,
"learning_rate": 8.82712508808443e-07,
"loss": 0.8276,
"step": 10370
},
{
"epoch": 0.161520079086458,
"grad_norm": 5.678191184997559,
"learning_rate": 8.82630569803838e-07,
"loss": 0.7526,
"step": 10375
},
{
"epoch": 0.16159792008842738,
"grad_norm": 5.475320816040039,
"learning_rate": 8.82548630799233e-07,
"loss": 0.8913,
"step": 10380
},
{
"epoch": 0.16167576109039675,
"grad_norm": 15.008077621459961,
"learning_rate": 8.824666917946281e-07,
"loss": 0.7912,
"step": 10385
},
{
"epoch": 0.16175360209236614,
"grad_norm": 3.8481225967407227,
"learning_rate": 8.823847527900232e-07,
"loss": 0.7949,
"step": 10390
},
{
"epoch": 0.1618314430943355,
"grad_norm": 2.759436845779419,
"learning_rate": 8.82302813785418e-07,
"loss": 0.7449,
"step": 10395
},
{
"epoch": 0.1619092840963049,
"grad_norm": 8.001230239868164,
"learning_rate": 8.822208747808131e-07,
"loss": 0.853,
"step": 10400
},
{
"epoch": 0.16198712509827426,
"grad_norm": 5.410604953765869,
"learning_rate": 8.821389357762082e-07,
"loss": 0.8032,
"step": 10405
},
{
"epoch": 0.16206496610024365,
"grad_norm": 17.351768493652344,
"learning_rate": 8.820569967716031e-07,
"loss": 0.7418,
"step": 10410
},
{
"epoch": 0.16214280710221302,
"grad_norm": 3.534604549407959,
"learning_rate": 8.819750577669982e-07,
"loss": 0.7653,
"step": 10415
},
{
"epoch": 0.1622206481041824,
"grad_norm": 8.916220664978027,
"learning_rate": 8.818931187623933e-07,
"loss": 0.837,
"step": 10420
},
{
"epoch": 0.16229848910615177,
"grad_norm": 4.538114547729492,
"learning_rate": 8.818111797577883e-07,
"loss": 0.7677,
"step": 10425
},
{
"epoch": 0.16237633010812116,
"grad_norm": 3.6164751052856445,
"learning_rate": 8.817292407531833e-07,
"loss": 0.7675,
"step": 10430
},
{
"epoch": 0.16245417111009053,
"grad_norm": 4.763157844543457,
"learning_rate": 8.816473017485783e-07,
"loss": 0.7237,
"step": 10435
},
{
"epoch": 0.1625320121120599,
"grad_norm": 4.728906631469727,
"learning_rate": 8.815653627439733e-07,
"loss": 0.8593,
"step": 10440
},
{
"epoch": 0.16260985311402928,
"grad_norm": 3.878848075866699,
"learning_rate": 8.814834237393683e-07,
"loss": 0.8701,
"step": 10445
},
{
"epoch": 0.16268769411599865,
"grad_norm": 5.231772422790527,
"learning_rate": 8.814014847347634e-07,
"loss": 0.8108,
"step": 10450
},
{
"epoch": 0.16276553511796804,
"grad_norm": 9.139507293701172,
"learning_rate": 8.813195457301584e-07,
"loss": 0.7747,
"step": 10455
},
{
"epoch": 0.1628433761199374,
"grad_norm": 4.208261013031006,
"learning_rate": 8.812376067255535e-07,
"loss": 0.7136,
"step": 10460
},
{
"epoch": 0.1629212171219068,
"grad_norm": 4.353900909423828,
"learning_rate": 8.811556677209485e-07,
"loss": 0.7753,
"step": 10465
},
{
"epoch": 0.16299905812387616,
"grad_norm": 2.729976177215576,
"learning_rate": 8.810737287163435e-07,
"loss": 0.7926,
"step": 10470
},
{
"epoch": 0.16307689912584555,
"grad_norm": 2.928906202316284,
"learning_rate": 8.809917897117385e-07,
"loss": 0.7747,
"step": 10475
},
{
"epoch": 0.16315474012781492,
"grad_norm": 4.206002712249756,
"learning_rate": 8.809098507071335e-07,
"loss": 0.8576,
"step": 10480
},
{
"epoch": 0.1632325811297843,
"grad_norm": 7.724112510681152,
"learning_rate": 8.808279117025286e-07,
"loss": 0.8393,
"step": 10485
},
{
"epoch": 0.16331042213175367,
"grad_norm": 3.295132637023926,
"learning_rate": 8.807459726979236e-07,
"loss": 0.778,
"step": 10490
},
{
"epoch": 0.16338826313372307,
"grad_norm": 3.3869988918304443,
"learning_rate": 8.806640336933187e-07,
"loss": 0.7569,
"step": 10495
},
{
"epoch": 0.16346610413569243,
"grad_norm": 3.718599319458008,
"learning_rate": 8.805820946887137e-07,
"loss": 0.8747,
"step": 10500
},
{
"epoch": 0.16354394513766182,
"grad_norm": 3.8476598262786865,
"learning_rate": 8.805001556841087e-07,
"loss": 0.6924,
"step": 10505
},
{
"epoch": 0.1636217861396312,
"grad_norm": 3.9861888885498047,
"learning_rate": 8.804182166795038e-07,
"loss": 0.8145,
"step": 10510
},
{
"epoch": 0.16369962714160058,
"grad_norm": 3.1306610107421875,
"learning_rate": 8.803362776748989e-07,
"loss": 0.6579,
"step": 10515
},
{
"epoch": 0.16377746814356994,
"grad_norm": 4.904189586639404,
"learning_rate": 8.802543386702937e-07,
"loss": 0.8639,
"step": 10520
},
{
"epoch": 0.16385530914553934,
"grad_norm": 5.17719841003418,
"learning_rate": 8.801723996656888e-07,
"loss": 0.8722,
"step": 10525
},
{
"epoch": 0.1639331501475087,
"grad_norm": 2.9521892070770264,
"learning_rate": 8.800904606610839e-07,
"loss": 0.7665,
"step": 10530
},
{
"epoch": 0.16401099114947806,
"grad_norm": 3.255908966064453,
"learning_rate": 8.800085216564788e-07,
"loss": 0.7702,
"step": 10535
},
{
"epoch": 0.16408883215144746,
"grad_norm": 6.015521049499512,
"learning_rate": 8.799265826518739e-07,
"loss": 0.7845,
"step": 10540
},
{
"epoch": 0.16416667315341682,
"grad_norm": 4.091128826141357,
"learning_rate": 8.79844643647269e-07,
"loss": 0.9369,
"step": 10545
},
{
"epoch": 0.1642445141553862,
"grad_norm": 4.375763416290283,
"learning_rate": 8.79762704642664e-07,
"loss": 0.8318,
"step": 10550
},
{
"epoch": 0.16432235515735558,
"grad_norm": 6.054563522338867,
"learning_rate": 8.79680765638059e-07,
"loss": 0.8422,
"step": 10555
},
{
"epoch": 0.16440019615932497,
"grad_norm": 4.5666303634643555,
"learning_rate": 8.79598826633454e-07,
"loss": 0.8438,
"step": 10560
},
{
"epoch": 0.16447803716129433,
"grad_norm": 3.462005615234375,
"learning_rate": 8.79516887628849e-07,
"loss": 0.7772,
"step": 10565
},
{
"epoch": 0.16455587816326372,
"grad_norm": 3.66213321685791,
"learning_rate": 8.79434948624244e-07,
"loss": 0.9158,
"step": 10570
},
{
"epoch": 0.1646337191652331,
"grad_norm": 3.454305648803711,
"learning_rate": 8.793530096196391e-07,
"loss": 0.7452,
"step": 10575
},
{
"epoch": 0.16471156016720248,
"grad_norm": 4.568371295928955,
"learning_rate": 8.792710706150341e-07,
"loss": 0.855,
"step": 10580
},
{
"epoch": 0.16478940116917185,
"grad_norm": 5.541522979736328,
"learning_rate": 8.791891316104292e-07,
"loss": 0.9156,
"step": 10585
},
{
"epoch": 0.16486724217114124,
"grad_norm": 4.69566011428833,
"learning_rate": 8.791071926058242e-07,
"loss": 0.7321,
"step": 10590
},
{
"epoch": 0.1649450831731106,
"grad_norm": 7.592962741851807,
"learning_rate": 8.790252536012192e-07,
"loss": 0.7888,
"step": 10595
},
{
"epoch": 0.16502292417508,
"grad_norm": 5.3040080070495605,
"learning_rate": 8.789433145966142e-07,
"loss": 0.7643,
"step": 10600
},
{
"epoch": 0.16510076517704936,
"grad_norm": 2.616908073425293,
"learning_rate": 8.788613755920093e-07,
"loss": 0.7614,
"step": 10605
},
{
"epoch": 0.16517860617901875,
"grad_norm": 3.683166980743408,
"learning_rate": 8.787794365874043e-07,
"loss": 0.715,
"step": 10610
},
{
"epoch": 0.16525644718098811,
"grad_norm": 4.2956132888793945,
"learning_rate": 8.786974975827993e-07,
"loss": 0.9153,
"step": 10615
},
{
"epoch": 0.16533428818295748,
"grad_norm": 3.76724910736084,
"learning_rate": 8.786155585781944e-07,
"loss": 0.86,
"step": 10620
},
{
"epoch": 0.16541212918492687,
"grad_norm": 3.4380204677581787,
"learning_rate": 8.785336195735894e-07,
"loss": 0.8662,
"step": 10625
},
{
"epoch": 0.16548997018689623,
"grad_norm": 8.281620025634766,
"learning_rate": 8.784516805689844e-07,
"loss": 0.8607,
"step": 10630
},
{
"epoch": 0.16556781118886563,
"grad_norm": 7.735804557800293,
"learning_rate": 8.783697415643795e-07,
"loss": 0.7513,
"step": 10635
},
{
"epoch": 0.165645652190835,
"grad_norm": 6.491322040557861,
"learning_rate": 8.782878025597745e-07,
"loss": 0.824,
"step": 10640
},
{
"epoch": 0.16572349319280438,
"grad_norm": 2.7615108489990234,
"learning_rate": 8.782058635551694e-07,
"loss": 0.9231,
"step": 10645
},
{
"epoch": 0.16580133419477375,
"grad_norm": 5.219547748565674,
"learning_rate": 8.781239245505645e-07,
"loss": 0.7178,
"step": 10650
},
{
"epoch": 0.16587917519674314,
"grad_norm": 8.346015930175781,
"learning_rate": 8.780419855459596e-07,
"loss": 0.8779,
"step": 10655
},
{
"epoch": 0.1659570161987125,
"grad_norm": 9.989750862121582,
"learning_rate": 8.779600465413545e-07,
"loss": 0.7609,
"step": 10660
},
{
"epoch": 0.1660348572006819,
"grad_norm": 6.761223793029785,
"learning_rate": 8.778781075367496e-07,
"loss": 0.9394,
"step": 10665
},
{
"epoch": 0.16611269820265126,
"grad_norm": 3.055026054382324,
"learning_rate": 8.777961685321447e-07,
"loss": 0.8548,
"step": 10670
},
{
"epoch": 0.16619053920462065,
"grad_norm": 7.2378950119018555,
"learning_rate": 8.777142295275397e-07,
"loss": 0.8538,
"step": 10675
},
{
"epoch": 0.16626838020659002,
"grad_norm": 11.686674118041992,
"learning_rate": 8.776322905229346e-07,
"loss": 0.7245,
"step": 10680
},
{
"epoch": 0.1663462212085594,
"grad_norm": 3.3093395233154297,
"learning_rate": 8.775503515183297e-07,
"loss": 0.7876,
"step": 10685
},
{
"epoch": 0.16642406221052877,
"grad_norm": 5.579384803771973,
"learning_rate": 8.774684125137247e-07,
"loss": 0.8294,
"step": 10690
},
{
"epoch": 0.16650190321249816,
"grad_norm": 3.092195749282837,
"learning_rate": 8.773864735091198e-07,
"loss": 0.8116,
"step": 10695
},
{
"epoch": 0.16657974421446753,
"grad_norm": 5.8653130531311035,
"learning_rate": 8.773045345045148e-07,
"loss": 0.7827,
"step": 10700
},
{
"epoch": 0.1666575852164369,
"grad_norm": 8.496586799621582,
"learning_rate": 8.772225954999098e-07,
"loss": 0.871,
"step": 10705
},
{
"epoch": 0.16673542621840629,
"grad_norm": 3.554830312728882,
"learning_rate": 8.771406564953049e-07,
"loss": 0.7876,
"step": 10710
},
{
"epoch": 0.16681326722037565,
"grad_norm": 4.988160610198975,
"learning_rate": 8.770587174907e-07,
"loss": 0.8173,
"step": 10715
},
{
"epoch": 0.16689110822234504,
"grad_norm": 4.899982929229736,
"learning_rate": 8.769767784860948e-07,
"loss": 0.7582,
"step": 10720
},
{
"epoch": 0.1669689492243144,
"grad_norm": 3.1257996559143066,
"learning_rate": 8.768948394814899e-07,
"loss": 0.7396,
"step": 10725
},
{
"epoch": 0.1670467902262838,
"grad_norm": 7.962161064147949,
"learning_rate": 8.76812900476885e-07,
"loss": 0.8768,
"step": 10730
},
{
"epoch": 0.16712463122825316,
"grad_norm": 9.329161643981934,
"learning_rate": 8.7673096147228e-07,
"loss": 0.7469,
"step": 10735
},
{
"epoch": 0.16720247223022255,
"grad_norm": 2.5543742179870605,
"learning_rate": 8.76649022467675e-07,
"loss": 0.8198,
"step": 10740
},
{
"epoch": 0.16728031323219192,
"grad_norm": 3.192782402038574,
"learning_rate": 8.765670834630701e-07,
"loss": 0.7747,
"step": 10745
},
{
"epoch": 0.1673581542341613,
"grad_norm": 3.3908746242523193,
"learning_rate": 8.764851444584652e-07,
"loss": 0.8149,
"step": 10750
},
{
"epoch": 0.16743599523613067,
"grad_norm": 3.8879611492156982,
"learning_rate": 8.764032054538601e-07,
"loss": 0.774,
"step": 10755
},
{
"epoch": 0.16751383623810007,
"grad_norm": 5.076687335968018,
"learning_rate": 8.763212664492551e-07,
"loss": 0.799,
"step": 10760
},
{
"epoch": 0.16759167724006943,
"grad_norm": 5.339931964874268,
"learning_rate": 8.762393274446502e-07,
"loss": 0.756,
"step": 10765
},
{
"epoch": 0.16766951824203882,
"grad_norm": 4.076517581939697,
"learning_rate": 8.761573884400451e-07,
"loss": 0.8323,
"step": 10770
},
{
"epoch": 0.1677473592440082,
"grad_norm": 4.2539777755737305,
"learning_rate": 8.760754494354402e-07,
"loss": 0.6913,
"step": 10775
},
{
"epoch": 0.16782520024597758,
"grad_norm": 3.196856737136841,
"learning_rate": 8.759935104308353e-07,
"loss": 0.7324,
"step": 10780
},
{
"epoch": 0.16790304124794694,
"grad_norm": 13.658886909484863,
"learning_rate": 8.759115714262303e-07,
"loss": 0.7811,
"step": 10785
},
{
"epoch": 0.1679808822499163,
"grad_norm": 3.423370838165283,
"learning_rate": 8.758296324216253e-07,
"loss": 0.7902,
"step": 10790
},
{
"epoch": 0.1680587232518857,
"grad_norm": 4.14307165145874,
"learning_rate": 8.757476934170204e-07,
"loss": 0.7751,
"step": 10795
},
{
"epoch": 0.16813656425385506,
"grad_norm": 3.6597414016723633,
"learning_rate": 8.756657544124153e-07,
"loss": 0.7519,
"step": 10800
},
{
"epoch": 0.16821440525582446,
"grad_norm": 3.839320182800293,
"learning_rate": 8.755838154078103e-07,
"loss": 0.6788,
"step": 10805
},
{
"epoch": 0.16829224625779382,
"grad_norm": 5.377815246582031,
"learning_rate": 8.755018764032054e-07,
"loss": 0.8277,
"step": 10810
},
{
"epoch": 0.1683700872597632,
"grad_norm": 2.5293519496917725,
"learning_rate": 8.754199373986004e-07,
"loss": 0.7754,
"step": 10815
},
{
"epoch": 0.16844792826173258,
"grad_norm": 3.5276260375976562,
"learning_rate": 8.753379983939955e-07,
"loss": 0.6878,
"step": 10820
},
{
"epoch": 0.16852576926370197,
"grad_norm": 4.435384273529053,
"learning_rate": 8.752560593893905e-07,
"loss": 0.7848,
"step": 10825
},
{
"epoch": 0.16860361026567133,
"grad_norm": 4.338689804077148,
"learning_rate": 8.751741203847855e-07,
"loss": 0.7934,
"step": 10830
},
{
"epoch": 0.16868145126764073,
"grad_norm": 5.802286148071289,
"learning_rate": 8.750921813801806e-07,
"loss": 0.6343,
"step": 10835
},
{
"epoch": 0.1687592922696101,
"grad_norm": 5.789932727813721,
"learning_rate": 8.750102423755757e-07,
"loss": 0.8002,
"step": 10840
},
{
"epoch": 0.16883713327157948,
"grad_norm": 3.8039424419403076,
"learning_rate": 8.749283033709705e-07,
"loss": 0.7334,
"step": 10845
},
{
"epoch": 0.16891497427354885,
"grad_norm": 3.930701971054077,
"learning_rate": 8.748463643663656e-07,
"loss": 0.8077,
"step": 10850
},
{
"epoch": 0.16899281527551824,
"grad_norm": 15.397842407226562,
"learning_rate": 8.747644253617607e-07,
"loss": 0.9553,
"step": 10855
},
{
"epoch": 0.1690706562774876,
"grad_norm": 7.080071926116943,
"learning_rate": 8.746824863571557e-07,
"loss": 0.7472,
"step": 10860
},
{
"epoch": 0.169148497279457,
"grad_norm": 3.849839687347412,
"learning_rate": 8.746005473525507e-07,
"loss": 0.7191,
"step": 10865
},
{
"epoch": 0.16922633828142636,
"grad_norm": 3.95355486869812,
"learning_rate": 8.745186083479458e-07,
"loss": 0.661,
"step": 10870
},
{
"epoch": 0.16930417928339572,
"grad_norm": 2.4966964721679688,
"learning_rate": 8.744366693433409e-07,
"loss": 0.8823,
"step": 10875
},
{
"epoch": 0.16938202028536511,
"grad_norm": 6.468384742736816,
"learning_rate": 8.743547303387358e-07,
"loss": 0.7943,
"step": 10880
},
{
"epoch": 0.16945986128733448,
"grad_norm": 4.023189544677734,
"learning_rate": 8.742727913341308e-07,
"loss": 0.7107,
"step": 10885
},
{
"epoch": 0.16953770228930387,
"grad_norm": 3.180260419845581,
"learning_rate": 8.741908523295259e-07,
"loss": 0.7373,
"step": 10890
},
{
"epoch": 0.16961554329127324,
"grad_norm": 4.347747802734375,
"learning_rate": 8.741089133249208e-07,
"loss": 0.7309,
"step": 10895
},
{
"epoch": 0.16969338429324263,
"grad_norm": 6.062042713165283,
"learning_rate": 8.740269743203159e-07,
"loss": 0.7572,
"step": 10900
},
{
"epoch": 0.169771225295212,
"grad_norm": 2.9714925289154053,
"learning_rate": 8.73945035315711e-07,
"loss": 0.7924,
"step": 10905
},
{
"epoch": 0.16984906629718138,
"grad_norm": 3.827422618865967,
"learning_rate": 8.73863096311106e-07,
"loss": 0.912,
"step": 10910
},
{
"epoch": 0.16992690729915075,
"grad_norm": 5.133169174194336,
"learning_rate": 8.73781157306501e-07,
"loss": 0.7078,
"step": 10915
},
{
"epoch": 0.17000474830112014,
"grad_norm": 2.856977939605713,
"learning_rate": 8.736992183018961e-07,
"loss": 0.8076,
"step": 10920
},
{
"epoch": 0.1700825893030895,
"grad_norm": 4.684539318084717,
"learning_rate": 8.73617279297291e-07,
"loss": 0.7045,
"step": 10925
},
{
"epoch": 0.1701604303050589,
"grad_norm": 4.420496940612793,
"learning_rate": 8.73535340292686e-07,
"loss": 0.8248,
"step": 10930
},
{
"epoch": 0.17023827130702826,
"grad_norm": 3.429518461227417,
"learning_rate": 8.734534012880811e-07,
"loss": 0.7643,
"step": 10935
},
{
"epoch": 0.17031611230899765,
"grad_norm": 3.1888604164123535,
"learning_rate": 8.733714622834761e-07,
"loss": 0.7827,
"step": 10940
},
{
"epoch": 0.17039395331096702,
"grad_norm": 3.491792917251587,
"learning_rate": 8.732895232788712e-07,
"loss": 0.8594,
"step": 10945
},
{
"epoch": 0.1704717943129364,
"grad_norm": 4.644289493560791,
"learning_rate": 8.732075842742662e-07,
"loss": 0.7508,
"step": 10950
},
{
"epoch": 0.17054963531490577,
"grad_norm": 3.222562313079834,
"learning_rate": 8.731256452696612e-07,
"loss": 0.7755,
"step": 10955
},
{
"epoch": 0.17062747631687514,
"grad_norm": 7.415604591369629,
"learning_rate": 8.730437062650563e-07,
"loss": 0.677,
"step": 10960
},
{
"epoch": 0.17070531731884453,
"grad_norm": 3.6360597610473633,
"learning_rate": 8.729617672604513e-07,
"loss": 0.8338,
"step": 10965
},
{
"epoch": 0.1707831583208139,
"grad_norm": 9.364253044128418,
"learning_rate": 8.728798282558462e-07,
"loss": 0.7644,
"step": 10970
},
{
"epoch": 0.17086099932278329,
"grad_norm": 3.551168441772461,
"learning_rate": 8.727978892512413e-07,
"loss": 0.7808,
"step": 10975
},
{
"epoch": 0.17093884032475265,
"grad_norm": 6.72664213180542,
"learning_rate": 8.727159502466364e-07,
"loss": 0.7789,
"step": 10980
},
{
"epoch": 0.17101668132672204,
"grad_norm": 3.8760182857513428,
"learning_rate": 8.726340112420314e-07,
"loss": 0.7943,
"step": 10985
},
{
"epoch": 0.1710945223286914,
"grad_norm": 4.414963722229004,
"learning_rate": 8.725520722374264e-07,
"loss": 0.753,
"step": 10990
},
{
"epoch": 0.1711723633306608,
"grad_norm": 8.448708534240723,
"learning_rate": 8.724701332328215e-07,
"loss": 0.704,
"step": 10995
},
{
"epoch": 0.17125020433263016,
"grad_norm": 3.2329261302948,
"learning_rate": 8.723881942282166e-07,
"loss": 0.7565,
"step": 11000
},
{
"epoch": 0.17132804533459955,
"grad_norm": 7.1403303146362305,
"learning_rate": 8.723062552236114e-07,
"loss": 0.7924,
"step": 11005
},
{
"epoch": 0.17140588633656892,
"grad_norm": 2.50464129447937,
"learning_rate": 8.722243162190065e-07,
"loss": 0.8038,
"step": 11010
},
{
"epoch": 0.1714837273385383,
"grad_norm": 3.236157178878784,
"learning_rate": 8.721423772144016e-07,
"loss": 0.7118,
"step": 11015
},
{
"epoch": 0.17156156834050768,
"grad_norm": 4.607430934906006,
"learning_rate": 8.720604382097966e-07,
"loss": 0.8654,
"step": 11020
},
{
"epoch": 0.17163940934247707,
"grad_norm": 3.002549171447754,
"learning_rate": 8.719784992051916e-07,
"loss": 0.7738,
"step": 11025
},
{
"epoch": 0.17171725034444643,
"grad_norm": 3.545802116394043,
"learning_rate": 8.718965602005867e-07,
"loss": 0.7659,
"step": 11030
},
{
"epoch": 0.17179509134641582,
"grad_norm": 4.228000640869141,
"learning_rate": 8.718146211959817e-07,
"loss": 0.89,
"step": 11035
},
{
"epoch": 0.1718729323483852,
"grad_norm": 10.417171478271484,
"learning_rate": 8.717326821913767e-07,
"loss": 0.7852,
"step": 11040
},
{
"epoch": 0.17195077335035455,
"grad_norm": 5.213564872741699,
"learning_rate": 8.716507431867717e-07,
"loss": 0.8346,
"step": 11045
},
{
"epoch": 0.17202861435232394,
"grad_norm": 3.869044780731201,
"learning_rate": 8.715688041821667e-07,
"loss": 0.7275,
"step": 11050
},
{
"epoch": 0.1721064553542933,
"grad_norm": 3.7918996810913086,
"learning_rate": 8.714868651775618e-07,
"loss": 0.9288,
"step": 11055
},
{
"epoch": 0.1721842963562627,
"grad_norm": 3.620591640472412,
"learning_rate": 8.714049261729568e-07,
"loss": 0.8667,
"step": 11060
},
{
"epoch": 0.17226213735823206,
"grad_norm": 3.3272628784179688,
"learning_rate": 8.713229871683518e-07,
"loss": 0.7839,
"step": 11065
},
{
"epoch": 0.17233997836020146,
"grad_norm": 3.3981659412384033,
"learning_rate": 8.712410481637469e-07,
"loss": 0.8703,
"step": 11070
},
{
"epoch": 0.17241781936217082,
"grad_norm": 2.7353670597076416,
"learning_rate": 8.71159109159142e-07,
"loss": 0.8323,
"step": 11075
},
{
"epoch": 0.1724956603641402,
"grad_norm": 3.0876946449279785,
"learning_rate": 8.710771701545369e-07,
"loss": 0.7806,
"step": 11080
},
{
"epoch": 0.17257350136610958,
"grad_norm": 4.163149833679199,
"learning_rate": 8.709952311499319e-07,
"loss": 0.738,
"step": 11085
},
{
"epoch": 0.17265134236807897,
"grad_norm": 3.454596519470215,
"learning_rate": 8.70913292145327e-07,
"loss": 0.7692,
"step": 11090
},
{
"epoch": 0.17272918337004833,
"grad_norm": 5.42886209487915,
"learning_rate": 8.708313531407219e-07,
"loss": 0.8061,
"step": 11095
},
{
"epoch": 0.17280702437201773,
"grad_norm": 6.216919898986816,
"learning_rate": 8.70749414136117e-07,
"loss": 0.7982,
"step": 11100
},
{
"epoch": 0.1728848653739871,
"grad_norm": 3.19822359085083,
"learning_rate": 8.706674751315121e-07,
"loss": 0.7365,
"step": 11105
},
{
"epoch": 0.17296270637595648,
"grad_norm": 6.890682220458984,
"learning_rate": 8.705855361269072e-07,
"loss": 0.8121,
"step": 11110
},
{
"epoch": 0.17304054737792585,
"grad_norm": 3.8538219928741455,
"learning_rate": 8.705035971223021e-07,
"loss": 0.8005,
"step": 11115
},
{
"epoch": 0.17311838837989524,
"grad_norm": 4.351493835449219,
"learning_rate": 8.704216581176972e-07,
"loss": 0.7921,
"step": 11120
},
{
"epoch": 0.1731962293818646,
"grad_norm": 4.173168182373047,
"learning_rate": 8.703397191130923e-07,
"loss": 0.7759,
"step": 11125
},
{
"epoch": 0.17327407038383397,
"grad_norm": 3.0455987453460693,
"learning_rate": 8.702577801084871e-07,
"loss": 0.7061,
"step": 11130
},
{
"epoch": 0.17335191138580336,
"grad_norm": 8.838937759399414,
"learning_rate": 8.701758411038822e-07,
"loss": 0.6888,
"step": 11135
},
{
"epoch": 0.17342975238777272,
"grad_norm": 10.514293670654297,
"learning_rate": 8.700939020992773e-07,
"loss": 0.8403,
"step": 11140
},
{
"epoch": 0.17350759338974212,
"grad_norm": 2.9291927814483643,
"learning_rate": 8.700119630946723e-07,
"loss": 0.8609,
"step": 11145
},
{
"epoch": 0.17358543439171148,
"grad_norm": 4.258464336395264,
"learning_rate": 8.699300240900673e-07,
"loss": 0.8436,
"step": 11150
},
{
"epoch": 0.17366327539368087,
"grad_norm": 4.499458312988281,
"learning_rate": 8.698480850854624e-07,
"loss": 0.883,
"step": 11155
},
{
"epoch": 0.17374111639565024,
"grad_norm": 4.2294745445251465,
"learning_rate": 8.697661460808574e-07,
"loss": 0.711,
"step": 11160
},
{
"epoch": 0.17381895739761963,
"grad_norm": 2.956054925918579,
"learning_rate": 8.696842070762525e-07,
"loss": 0.8658,
"step": 11165
},
{
"epoch": 0.173896798399589,
"grad_norm": 6.596224308013916,
"learning_rate": 8.696022680716474e-07,
"loss": 0.8289,
"step": 11170
},
{
"epoch": 0.17397463940155838,
"grad_norm": 4.582150936126709,
"learning_rate": 8.695203290670424e-07,
"loss": 0.7524,
"step": 11175
},
{
"epoch": 0.17405248040352775,
"grad_norm": 2.815945625305176,
"learning_rate": 8.694383900624375e-07,
"loss": 0.8105,
"step": 11180
},
{
"epoch": 0.17413032140549714,
"grad_norm": 7.399906635284424,
"learning_rate": 8.693564510578325e-07,
"loss": 0.628,
"step": 11185
},
{
"epoch": 0.1742081624074665,
"grad_norm": 2.3162524700164795,
"learning_rate": 8.692745120532275e-07,
"loss": 0.6802,
"step": 11190
},
{
"epoch": 0.1742860034094359,
"grad_norm": 4.691956043243408,
"learning_rate": 8.691925730486226e-07,
"loss": 0.9559,
"step": 11195
},
{
"epoch": 0.17436384441140526,
"grad_norm": 5.132546901702881,
"learning_rate": 8.691106340440177e-07,
"loss": 0.8418,
"step": 11200
},
{
"epoch": 0.17444168541337465,
"grad_norm": 6.407838344573975,
"learning_rate": 8.690286950394126e-07,
"loss": 0.9149,
"step": 11205
},
{
"epoch": 0.17451952641534402,
"grad_norm": 2.8634774684906006,
"learning_rate": 8.689467560348076e-07,
"loss": 0.8361,
"step": 11210
},
{
"epoch": 0.1745973674173134,
"grad_norm": 5.304955959320068,
"learning_rate": 8.688648170302027e-07,
"loss": 0.7911,
"step": 11215
},
{
"epoch": 0.17467520841928277,
"grad_norm": 6.025475978851318,
"learning_rate": 8.687828780255976e-07,
"loss": 0.7571,
"step": 11220
},
{
"epoch": 0.17475304942125214,
"grad_norm": 4.901416301727295,
"learning_rate": 8.687009390209927e-07,
"loss": 0.7321,
"step": 11225
},
{
"epoch": 0.17483089042322153,
"grad_norm": 2.951046943664551,
"learning_rate": 8.686190000163878e-07,
"loss": 0.756,
"step": 11230
},
{
"epoch": 0.1749087314251909,
"grad_norm": 3.0569169521331787,
"learning_rate": 8.685370610117829e-07,
"loss": 0.7937,
"step": 11235
},
{
"epoch": 0.1749865724271603,
"grad_norm": 3.96439528465271,
"learning_rate": 8.684551220071778e-07,
"loss": 0.7065,
"step": 11240
},
{
"epoch": 0.17506441342912965,
"grad_norm": 5.546309947967529,
"learning_rate": 8.683731830025729e-07,
"loss": 0.7787,
"step": 11245
},
{
"epoch": 0.17514225443109904,
"grad_norm": 4.913859844207764,
"learning_rate": 8.682912439979679e-07,
"loss": 0.7689,
"step": 11250
},
{
"epoch": 0.1752200954330684,
"grad_norm": 3.5462117195129395,
"learning_rate": 8.682093049933628e-07,
"loss": 0.7766,
"step": 11255
},
{
"epoch": 0.1752979364350378,
"grad_norm": 5.14613676071167,
"learning_rate": 8.681273659887579e-07,
"loss": 0.8672,
"step": 11260
},
{
"epoch": 0.17537577743700716,
"grad_norm": 3.5507776737213135,
"learning_rate": 8.68045426984153e-07,
"loss": 0.8519,
"step": 11265
},
{
"epoch": 0.17545361843897656,
"grad_norm": 5.96008825302124,
"learning_rate": 8.67963487979548e-07,
"loss": 0.8072,
"step": 11270
},
{
"epoch": 0.17553145944094592,
"grad_norm": 3.9602410793304443,
"learning_rate": 8.67881548974943e-07,
"loss": 0.7046,
"step": 11275
},
{
"epoch": 0.1756093004429153,
"grad_norm": 9.932755470275879,
"learning_rate": 8.677996099703381e-07,
"loss": 0.6901,
"step": 11280
},
{
"epoch": 0.17568714144488468,
"grad_norm": 3.0417749881744385,
"learning_rate": 8.677176709657331e-07,
"loss": 0.8085,
"step": 11285
},
{
"epoch": 0.17576498244685407,
"grad_norm": 8.406500816345215,
"learning_rate": 8.67635731961128e-07,
"loss": 0.8751,
"step": 11290
},
{
"epoch": 0.17584282344882343,
"grad_norm": 3.1526453495025635,
"learning_rate": 8.675537929565231e-07,
"loss": 0.779,
"step": 11295
},
{
"epoch": 0.17592066445079282,
"grad_norm": 8.264491081237793,
"learning_rate": 8.674718539519181e-07,
"loss": 0.6696,
"step": 11300
},
{
"epoch": 0.1759985054527622,
"grad_norm": 4.0117011070251465,
"learning_rate": 8.673899149473132e-07,
"loss": 0.7622,
"step": 11305
},
{
"epoch": 0.17607634645473155,
"grad_norm": 5.8008527755737305,
"learning_rate": 8.673079759427082e-07,
"loss": 0.9383,
"step": 11310
},
{
"epoch": 0.17615418745670094,
"grad_norm": 3.362180709838867,
"learning_rate": 8.672260369381032e-07,
"loss": 0.7315,
"step": 11315
},
{
"epoch": 0.1762320284586703,
"grad_norm": 3.2175207138061523,
"learning_rate": 8.671440979334983e-07,
"loss": 0.7064,
"step": 11320
},
{
"epoch": 0.1763098694606397,
"grad_norm": 3.180907964706421,
"learning_rate": 8.670621589288934e-07,
"loss": 0.7622,
"step": 11325
},
{
"epoch": 0.17638771046260907,
"grad_norm": 4.250461578369141,
"learning_rate": 8.669802199242882e-07,
"loss": 0.9496,
"step": 11330
},
{
"epoch": 0.17646555146457846,
"grad_norm": 3.719573497772217,
"learning_rate": 8.668982809196833e-07,
"loss": 0.7536,
"step": 11335
},
{
"epoch": 0.17654339246654782,
"grad_norm": 4.155417442321777,
"learning_rate": 8.668163419150784e-07,
"loss": 0.7122,
"step": 11340
},
{
"epoch": 0.1766212334685172,
"grad_norm": 5.191723346710205,
"learning_rate": 8.667344029104733e-07,
"loss": 0.6908,
"step": 11345
},
{
"epoch": 0.17669907447048658,
"grad_norm": 6.202426910400391,
"learning_rate": 8.666524639058684e-07,
"loss": 0.8704,
"step": 11350
},
{
"epoch": 0.17677691547245597,
"grad_norm": 3.2510268688201904,
"learning_rate": 8.665705249012635e-07,
"loss": 0.702,
"step": 11355
},
{
"epoch": 0.17685475647442533,
"grad_norm": 7.769083023071289,
"learning_rate": 8.664885858966586e-07,
"loss": 0.8048,
"step": 11360
},
{
"epoch": 0.17693259747639473,
"grad_norm": 3.8902037143707275,
"learning_rate": 8.664066468920535e-07,
"loss": 0.7719,
"step": 11365
},
{
"epoch": 0.1770104384783641,
"grad_norm": 4.6013407707214355,
"learning_rate": 8.663247078874485e-07,
"loss": 0.7691,
"step": 11370
},
{
"epoch": 0.17708827948033348,
"grad_norm": 7.063052654266357,
"learning_rate": 8.662427688828436e-07,
"loss": 0.8091,
"step": 11375
},
{
"epoch": 0.17716612048230285,
"grad_norm": 4.544634819030762,
"learning_rate": 8.661608298782386e-07,
"loss": 0.9371,
"step": 11380
},
{
"epoch": 0.17724396148427224,
"grad_norm": 4.195473670959473,
"learning_rate": 8.660788908736336e-07,
"loss": 0.7634,
"step": 11385
},
{
"epoch": 0.1773218024862416,
"grad_norm": 18.608367919921875,
"learning_rate": 8.659969518690287e-07,
"loss": 0.7902,
"step": 11390
},
{
"epoch": 0.17739964348821097,
"grad_norm": 3.536041498184204,
"learning_rate": 8.659150128644237e-07,
"loss": 0.7095,
"step": 11395
},
{
"epoch": 0.17747748449018036,
"grad_norm": 3.1353678703308105,
"learning_rate": 8.658330738598187e-07,
"loss": 0.688,
"step": 11400
},
{
"epoch": 0.17755532549214972,
"grad_norm": 2.603710889816284,
"learning_rate": 8.657511348552138e-07,
"loss": 0.8218,
"step": 11405
},
{
"epoch": 0.17763316649411912,
"grad_norm": 3.3999760150909424,
"learning_rate": 8.656691958506087e-07,
"loss": 0.7279,
"step": 11410
},
{
"epoch": 0.17771100749608848,
"grad_norm": 3.0341594219207764,
"learning_rate": 8.655872568460038e-07,
"loss": 0.754,
"step": 11415
},
{
"epoch": 0.17778884849805787,
"grad_norm": 5.157776355743408,
"learning_rate": 8.655053178413988e-07,
"loss": 0.8347,
"step": 11420
},
{
"epoch": 0.17786668950002724,
"grad_norm": 5.881651878356934,
"learning_rate": 8.654233788367938e-07,
"loss": 0.868,
"step": 11425
},
{
"epoch": 0.17794453050199663,
"grad_norm": 7.980086326599121,
"learning_rate": 8.653414398321889e-07,
"loss": 0.8467,
"step": 11430
},
{
"epoch": 0.178022371503966,
"grad_norm": 3.4080753326416016,
"learning_rate": 8.65259500827584e-07,
"loss": 0.641,
"step": 11435
},
{
"epoch": 0.17810021250593538,
"grad_norm": 8.840136528015137,
"learning_rate": 8.651775618229789e-07,
"loss": 0.8391,
"step": 11440
},
{
"epoch": 0.17817805350790475,
"grad_norm": 6.058139324188232,
"learning_rate": 8.65095622818374e-07,
"loss": 0.6896,
"step": 11445
},
{
"epoch": 0.17825589450987414,
"grad_norm": 3.771533250808716,
"learning_rate": 8.650136838137691e-07,
"loss": 0.7656,
"step": 11450
},
{
"epoch": 0.1783337355118435,
"grad_norm": 5.53726053237915,
"learning_rate": 8.649317448091639e-07,
"loss": 0.7161,
"step": 11455
},
{
"epoch": 0.1784115765138129,
"grad_norm": 4.151732444763184,
"learning_rate": 8.64849805804559e-07,
"loss": 0.7258,
"step": 11460
},
{
"epoch": 0.17848941751578226,
"grad_norm": 5.9917731285095215,
"learning_rate": 8.647678667999541e-07,
"loss": 0.8906,
"step": 11465
},
{
"epoch": 0.17856725851775165,
"grad_norm": 3.5444159507751465,
"learning_rate": 8.64685927795349e-07,
"loss": 0.8006,
"step": 11470
},
{
"epoch": 0.17864509951972102,
"grad_norm": 7.527516841888428,
"learning_rate": 8.646039887907441e-07,
"loss": 0.837,
"step": 11475
},
{
"epoch": 0.17872294052169038,
"grad_norm": 3.505826711654663,
"learning_rate": 8.645220497861392e-07,
"loss": 0.7962,
"step": 11480
},
{
"epoch": 0.17880078152365977,
"grad_norm": 4.634178161621094,
"learning_rate": 8.644401107815343e-07,
"loss": 0.7389,
"step": 11485
},
{
"epoch": 0.17887862252562914,
"grad_norm": 2.834869623184204,
"learning_rate": 8.643581717769292e-07,
"loss": 0.837,
"step": 11490
},
{
"epoch": 0.17895646352759853,
"grad_norm": 4.649651527404785,
"learning_rate": 8.642762327723242e-07,
"loss": 0.8676,
"step": 11495
},
{
"epoch": 0.1790343045295679,
"grad_norm": 4.7943925857543945,
"learning_rate": 8.641942937677193e-07,
"loss": 0.7031,
"step": 11500
},
{
"epoch": 0.1791121455315373,
"grad_norm": 3.4490435123443604,
"learning_rate": 8.641123547631143e-07,
"loss": 0.8332,
"step": 11505
},
{
"epoch": 0.17918998653350665,
"grad_norm": 2.6124467849731445,
"learning_rate": 8.640304157585093e-07,
"loss": 0.8277,
"step": 11510
},
{
"epoch": 0.17926782753547604,
"grad_norm": 3.379868507385254,
"learning_rate": 8.639484767539044e-07,
"loss": 0.8617,
"step": 11515
},
{
"epoch": 0.1793456685374454,
"grad_norm": 4.773552417755127,
"learning_rate": 8.638665377492994e-07,
"loss": 0.8771,
"step": 11520
},
{
"epoch": 0.1794235095394148,
"grad_norm": 8.424163818359375,
"learning_rate": 8.637845987446945e-07,
"loss": 0.8067,
"step": 11525
},
{
"epoch": 0.17950135054138416,
"grad_norm": 4.11102294921875,
"learning_rate": 8.637026597400895e-07,
"loss": 0.7825,
"step": 11530
},
{
"epoch": 0.17957919154335356,
"grad_norm": 3.056248188018799,
"learning_rate": 8.636207207354844e-07,
"loss": 0.805,
"step": 11535
},
{
"epoch": 0.17965703254532292,
"grad_norm": 6.354325294494629,
"learning_rate": 8.635387817308795e-07,
"loss": 0.697,
"step": 11540
},
{
"epoch": 0.1797348735472923,
"grad_norm": 4.208707809448242,
"learning_rate": 8.634568427262745e-07,
"loss": 0.7442,
"step": 11545
},
{
"epoch": 0.17981271454926168,
"grad_norm": 3.256444215774536,
"learning_rate": 8.633749037216695e-07,
"loss": 0.7955,
"step": 11550
},
{
"epoch": 0.17989055555123107,
"grad_norm": 2.9025378227233887,
"learning_rate": 8.632929647170646e-07,
"loss": 0.8062,
"step": 11555
},
{
"epoch": 0.17996839655320043,
"grad_norm": 5.104341506958008,
"learning_rate": 8.632110257124597e-07,
"loss": 0.9282,
"step": 11560
},
{
"epoch": 0.1800462375551698,
"grad_norm": 3.7267444133758545,
"learning_rate": 8.631290867078546e-07,
"loss": 0.7416,
"step": 11565
},
{
"epoch": 0.1801240785571392,
"grad_norm": 2.9586052894592285,
"learning_rate": 8.630471477032497e-07,
"loss": 0.7961,
"step": 11570
},
{
"epoch": 0.18020191955910855,
"grad_norm": 4.650422096252441,
"learning_rate": 8.629652086986447e-07,
"loss": 0.7148,
"step": 11575
},
{
"epoch": 0.18027976056107795,
"grad_norm": 3.344991445541382,
"learning_rate": 8.628832696940396e-07,
"loss": 0.7804,
"step": 11580
},
{
"epoch": 0.1803576015630473,
"grad_norm": 3.1043858528137207,
"learning_rate": 8.628013306894347e-07,
"loss": 0.7004,
"step": 11585
},
{
"epoch": 0.1804354425650167,
"grad_norm": 4.275179862976074,
"learning_rate": 8.627193916848298e-07,
"loss": 0.702,
"step": 11590
},
{
"epoch": 0.18051328356698607,
"grad_norm": 5.884884357452393,
"learning_rate": 8.626374526802248e-07,
"loss": 0.7196,
"step": 11595
},
{
"epoch": 0.18059112456895546,
"grad_norm": 5.935474395751953,
"learning_rate": 8.625555136756198e-07,
"loss": 0.8548,
"step": 11600
},
{
"epoch": 0.18066896557092482,
"grad_norm": 3.8844761848449707,
"learning_rate": 8.624735746710149e-07,
"loss": 0.7467,
"step": 11605
},
{
"epoch": 0.18074680657289421,
"grad_norm": 3.4248223304748535,
"learning_rate": 8.6239163566641e-07,
"loss": 0.8664,
"step": 11610
},
{
"epoch": 0.18082464757486358,
"grad_norm": 4.0504679679870605,
"learning_rate": 8.623096966618048e-07,
"loss": 0.8649,
"step": 11615
},
{
"epoch": 0.18090248857683297,
"grad_norm": 3.0257060527801514,
"learning_rate": 8.622277576571999e-07,
"loss": 0.7091,
"step": 11620
},
{
"epoch": 0.18098032957880233,
"grad_norm": 4.030515193939209,
"learning_rate": 8.62145818652595e-07,
"loss": 0.8485,
"step": 11625
},
{
"epoch": 0.18105817058077173,
"grad_norm": 3.375437021255493,
"learning_rate": 8.6206387964799e-07,
"loss": 0.6832,
"step": 11630
},
{
"epoch": 0.1811360115827411,
"grad_norm": 4.179788112640381,
"learning_rate": 8.61981940643385e-07,
"loss": 0.8009,
"step": 11635
},
{
"epoch": 0.18121385258471048,
"grad_norm": 5.095760822296143,
"learning_rate": 8.619000016387801e-07,
"loss": 0.7984,
"step": 11640
},
{
"epoch": 0.18129169358667985,
"grad_norm": 3.096256732940674,
"learning_rate": 8.618180626341751e-07,
"loss": 0.8468,
"step": 11645
},
{
"epoch": 0.1813695345886492,
"grad_norm": 3.6533854007720947,
"learning_rate": 8.617361236295702e-07,
"loss": 0.7336,
"step": 11650
},
{
"epoch": 0.1814473755906186,
"grad_norm": 3.505079984664917,
"learning_rate": 8.616541846249651e-07,
"loss": 0.7025,
"step": 11655
},
{
"epoch": 0.18152521659258797,
"grad_norm": 3.6542341709136963,
"learning_rate": 8.615722456203601e-07,
"loss": 0.8489,
"step": 11660
},
{
"epoch": 0.18160305759455736,
"grad_norm": 6.4186811447143555,
"learning_rate": 8.614903066157552e-07,
"loss": 0.7712,
"step": 11665
},
{
"epoch": 0.18168089859652672,
"grad_norm": 5.470929145812988,
"learning_rate": 8.614083676111502e-07,
"loss": 0.819,
"step": 11670
},
{
"epoch": 0.18175873959849612,
"grad_norm": 3.3178632259368896,
"learning_rate": 8.613264286065452e-07,
"loss": 0.813,
"step": 11675
},
{
"epoch": 0.18183658060046548,
"grad_norm": 3.1316630840301514,
"learning_rate": 8.612444896019403e-07,
"loss": 0.7939,
"step": 11680
},
{
"epoch": 0.18191442160243487,
"grad_norm": 5.289381980895996,
"learning_rate": 8.611625505973354e-07,
"loss": 0.7354,
"step": 11685
},
{
"epoch": 0.18199226260440424,
"grad_norm": 2.2420859336853027,
"learning_rate": 8.610806115927303e-07,
"loss": 0.7458,
"step": 11690
},
{
"epoch": 0.18207010360637363,
"grad_norm": 5.712198257446289,
"learning_rate": 8.609986725881253e-07,
"loss": 0.7175,
"step": 11695
},
{
"epoch": 0.182147944608343,
"grad_norm": 4.2332353591918945,
"learning_rate": 8.609167335835204e-07,
"loss": 0.7712,
"step": 11700
},
{
"epoch": 0.18222578561031239,
"grad_norm": 3.4181714057922363,
"learning_rate": 8.608347945789153e-07,
"loss": 0.7699,
"step": 11705
},
{
"epoch": 0.18230362661228175,
"grad_norm": 4.554285049438477,
"learning_rate": 8.607528555743104e-07,
"loss": 0.7718,
"step": 11710
},
{
"epoch": 0.18238146761425114,
"grad_norm": 4.693836212158203,
"learning_rate": 8.606709165697055e-07,
"loss": 0.8118,
"step": 11715
},
{
"epoch": 0.1824593086162205,
"grad_norm": 5.117660999298096,
"learning_rate": 8.605889775651005e-07,
"loss": 0.7083,
"step": 11720
},
{
"epoch": 0.1825371496181899,
"grad_norm": 7.209866046905518,
"learning_rate": 8.605070385604955e-07,
"loss": 0.832,
"step": 11725
},
{
"epoch": 0.18261499062015926,
"grad_norm": 3.6614935398101807,
"learning_rate": 8.604250995558906e-07,
"loss": 0.7817,
"step": 11730
},
{
"epoch": 0.18269283162212863,
"grad_norm": 3.863678216934204,
"learning_rate": 8.603431605512855e-07,
"loss": 0.8006,
"step": 11735
},
{
"epoch": 0.18277067262409802,
"grad_norm": 3.742063522338867,
"learning_rate": 8.602612215466806e-07,
"loss": 0.7947,
"step": 11740
},
{
"epoch": 0.18284851362606738,
"grad_norm": 2.856513261795044,
"learning_rate": 8.601792825420756e-07,
"loss": 0.745,
"step": 11745
},
{
"epoch": 0.18292635462803677,
"grad_norm": 2.845435619354248,
"learning_rate": 8.600973435374707e-07,
"loss": 0.794,
"step": 11750
},
{
"epoch": 0.18300419563000614,
"grad_norm": 8.055949211120605,
"learning_rate": 8.600154045328657e-07,
"loss": 0.7383,
"step": 11755
},
{
"epoch": 0.18308203663197553,
"grad_norm": 4.0927510261535645,
"learning_rate": 8.599334655282607e-07,
"loss": 0.6759,
"step": 11760
},
{
"epoch": 0.1831598776339449,
"grad_norm": 3.1355292797088623,
"learning_rate": 8.598515265236558e-07,
"loss": 0.771,
"step": 11765
},
{
"epoch": 0.1832377186359143,
"grad_norm": 9.864462852478027,
"learning_rate": 8.597695875190508e-07,
"loss": 0.709,
"step": 11770
},
{
"epoch": 0.18331555963788365,
"grad_norm": 5.377257823944092,
"learning_rate": 8.596876485144459e-07,
"loss": 0.903,
"step": 11775
},
{
"epoch": 0.18339340063985304,
"grad_norm": 3.909209728240967,
"learning_rate": 8.596057095098408e-07,
"loss": 0.7645,
"step": 11780
},
{
"epoch": 0.1834712416418224,
"grad_norm": 4.229231834411621,
"learning_rate": 8.595237705052358e-07,
"loss": 0.8469,
"step": 11785
},
{
"epoch": 0.1835490826437918,
"grad_norm": 9.634142875671387,
"learning_rate": 8.594418315006309e-07,
"loss": 0.88,
"step": 11790
},
{
"epoch": 0.18362692364576116,
"grad_norm": 2.9738218784332275,
"learning_rate": 8.59359892496026e-07,
"loss": 0.689,
"step": 11795
},
{
"epoch": 0.18370476464773056,
"grad_norm": 3.0469038486480713,
"learning_rate": 8.592779534914209e-07,
"loss": 0.8383,
"step": 11800
},
{
"epoch": 0.18378260564969992,
"grad_norm": 4.129268646240234,
"learning_rate": 8.59196014486816e-07,
"loss": 0.6902,
"step": 11805
},
{
"epoch": 0.1838604466516693,
"grad_norm": 4.603461265563965,
"learning_rate": 8.591140754822111e-07,
"loss": 0.813,
"step": 11810
},
{
"epoch": 0.18393828765363868,
"grad_norm": 4.814962387084961,
"learning_rate": 8.59032136477606e-07,
"loss": 0.8819,
"step": 11815
},
{
"epoch": 0.18401612865560804,
"grad_norm": 7.185861110687256,
"learning_rate": 8.58950197473001e-07,
"loss": 0.8366,
"step": 11820
},
{
"epoch": 0.18409396965757743,
"grad_norm": 7.243460655212402,
"learning_rate": 8.588682584683961e-07,
"loss": 0.9382,
"step": 11825
},
{
"epoch": 0.1841718106595468,
"grad_norm": 6.482030391693115,
"learning_rate": 8.587863194637911e-07,
"loss": 0.8316,
"step": 11830
},
{
"epoch": 0.1842496516615162,
"grad_norm": 12.124528884887695,
"learning_rate": 8.587043804591861e-07,
"loss": 0.8205,
"step": 11835
},
{
"epoch": 0.18432749266348555,
"grad_norm": 4.70367431640625,
"learning_rate": 8.586224414545812e-07,
"loss": 0.6944,
"step": 11840
},
{
"epoch": 0.18440533366545495,
"grad_norm": 6.1089558601379395,
"learning_rate": 8.585405024499762e-07,
"loss": 0.8094,
"step": 11845
},
{
"epoch": 0.1844831746674243,
"grad_norm": 3.807187557220459,
"learning_rate": 8.584585634453712e-07,
"loss": 0.8715,
"step": 11850
},
{
"epoch": 0.1845610156693937,
"grad_norm": 4.469877243041992,
"learning_rate": 8.583766244407663e-07,
"loss": 0.7794,
"step": 11855
},
{
"epoch": 0.18463885667136307,
"grad_norm": 5.07852029800415,
"learning_rate": 8.582946854361612e-07,
"loss": 0.7603,
"step": 11860
},
{
"epoch": 0.18471669767333246,
"grad_norm": 3.7635741233825684,
"learning_rate": 8.582127464315563e-07,
"loss": 0.817,
"step": 11865
},
{
"epoch": 0.18479453867530182,
"grad_norm": 5.198869228363037,
"learning_rate": 8.581308074269513e-07,
"loss": 0.8442,
"step": 11870
},
{
"epoch": 0.18487237967727121,
"grad_norm": 3.176208019256592,
"learning_rate": 8.580488684223464e-07,
"loss": 0.8135,
"step": 11875
},
{
"epoch": 0.18495022067924058,
"grad_norm": 5.479611396789551,
"learning_rate": 8.579669294177414e-07,
"loss": 0.686,
"step": 11880
},
{
"epoch": 0.18502806168120997,
"grad_norm": 5.823884010314941,
"learning_rate": 8.578849904131365e-07,
"loss": 0.7851,
"step": 11885
},
{
"epoch": 0.18510590268317934,
"grad_norm": 4.050318717956543,
"learning_rate": 8.578030514085315e-07,
"loss": 0.7321,
"step": 11890
},
{
"epoch": 0.18518374368514873,
"grad_norm": 2.703254461288452,
"learning_rate": 8.577211124039265e-07,
"loss": 0.7569,
"step": 11895
},
{
"epoch": 0.1852615846871181,
"grad_norm": 6.201961517333984,
"learning_rate": 8.576391733993215e-07,
"loss": 0.8172,
"step": 11900
},
{
"epoch": 0.18533942568908748,
"grad_norm": 4.213263511657715,
"learning_rate": 8.575572343947165e-07,
"loss": 0.8344,
"step": 11905
},
{
"epoch": 0.18541726669105685,
"grad_norm": 6.157925128936768,
"learning_rate": 8.574752953901115e-07,
"loss": 0.7343,
"step": 11910
},
{
"epoch": 0.1854951076930262,
"grad_norm": 4.5453691482543945,
"learning_rate": 8.573933563855066e-07,
"loss": 0.7038,
"step": 11915
},
{
"epoch": 0.1855729486949956,
"grad_norm": 4.831943035125732,
"learning_rate": 8.573114173809017e-07,
"loss": 0.8805,
"step": 11920
},
{
"epoch": 0.18565078969696497,
"grad_norm": 2.977743625640869,
"learning_rate": 8.572294783762966e-07,
"loss": 0.784,
"step": 11925
},
{
"epoch": 0.18572863069893436,
"grad_norm": 5.694007396697998,
"learning_rate": 8.571475393716917e-07,
"loss": 0.6915,
"step": 11930
},
{
"epoch": 0.18580647170090372,
"grad_norm": 4.1263017654418945,
"learning_rate": 8.570656003670868e-07,
"loss": 0.7476,
"step": 11935
},
{
"epoch": 0.18588431270287312,
"grad_norm": 5.350509166717529,
"learning_rate": 8.569836613624816e-07,
"loss": 0.6784,
"step": 11940
},
{
"epoch": 0.18596215370484248,
"grad_norm": 3.7994115352630615,
"learning_rate": 8.569017223578767e-07,
"loss": 0.7191,
"step": 11945
},
{
"epoch": 0.18603999470681187,
"grad_norm": 5.086226940155029,
"learning_rate": 8.568197833532718e-07,
"loss": 0.7841,
"step": 11950
},
{
"epoch": 0.18611783570878124,
"grad_norm": 4.248946189880371,
"learning_rate": 8.567378443486668e-07,
"loss": 0.8489,
"step": 11955
},
{
"epoch": 0.18619567671075063,
"grad_norm": 2.877885580062866,
"learning_rate": 8.566559053440618e-07,
"loss": 0.66,
"step": 11960
},
{
"epoch": 0.18627351771272,
"grad_norm": 6.250997066497803,
"learning_rate": 8.565739663394569e-07,
"loss": 0.7974,
"step": 11965
},
{
"epoch": 0.18635135871468939,
"grad_norm": 5.646812915802002,
"learning_rate": 8.564920273348519e-07,
"loss": 0.87,
"step": 11970
},
{
"epoch": 0.18642919971665875,
"grad_norm": 3.944369316101074,
"learning_rate": 8.56410088330247e-07,
"loss": 0.8484,
"step": 11975
},
{
"epoch": 0.18650704071862814,
"grad_norm": 3.154167413711548,
"learning_rate": 8.563281493256419e-07,
"loss": 0.728,
"step": 11980
},
{
"epoch": 0.1865848817205975,
"grad_norm": 5.012053489685059,
"learning_rate": 8.562462103210369e-07,
"loss": 0.7125,
"step": 11985
},
{
"epoch": 0.1866627227225669,
"grad_norm": 5.746982097625732,
"learning_rate": 8.56164271316432e-07,
"loss": 0.7059,
"step": 11990
},
{
"epoch": 0.18674056372453626,
"grad_norm": 3.114208698272705,
"learning_rate": 8.56082332311827e-07,
"loss": 0.76,
"step": 11995
},
{
"epoch": 0.18681840472650563,
"grad_norm": 3.1285858154296875,
"learning_rate": 8.560003933072221e-07,
"loss": 0.8049,
"step": 12000
}
],
"logging_steps": 5,
"max_steps": 64233,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.805456425474064e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}