{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998596998426938, "eval_steps": 500, "global_step": 14700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017006079673483271, "grad_norm": 14.845677861036755, "learning_rate": 1.1312217194570136e-07, "loss": 1.8551, "step": 5 }, { "epoch": 0.0034012159346966542, "grad_norm": 10.628996742983775, "learning_rate": 2.2624434389140273e-07, "loss": 1.8205, "step": 10 }, { "epoch": 0.005101823902044981, "grad_norm": 16.209420936017096, "learning_rate": 3.393665158371041e-07, "loss": 1.8248, "step": 15 }, { "epoch": 0.0068024318693933085, "grad_norm": 14.057467625353302, "learning_rate": 4.5248868778280546e-07, "loss": 1.8197, "step": 20 }, { "epoch": 0.008503039836741635, "grad_norm": 8.549081078810927, "learning_rate": 5.656108597285068e-07, "loss": 1.8493, "step": 25 }, { "epoch": 0.010203647804089963, "grad_norm": 8.704433651079075, "learning_rate": 6.787330316742082e-07, "loss": 1.811, "step": 30 }, { "epoch": 0.011904255771438289, "grad_norm": 12.2652942343875, "learning_rate": 7.918552036199095e-07, "loss": 1.8156, "step": 35 }, { "epoch": 0.013604863738786617, "grad_norm": 7.896258864091117, "learning_rate": 9.049773755656109e-07, "loss": 1.8359, "step": 40 }, { "epoch": 0.015305471706134943, "grad_norm": 6.971049091184883, "learning_rate": 1.0180995475113123e-06, "loss": 1.8116, "step": 45 }, { "epoch": 0.01700607967348327, "grad_norm": 6.957776653029673, "learning_rate": 1.1312217194570136e-06, "loss": 1.7826, "step": 50 }, { "epoch": 0.018706687640831596, "grad_norm": 9.177739116592859, "learning_rate": 1.244343891402715e-06, "loss": 1.7557, "step": 55 }, { "epoch": 0.020407295608179925, "grad_norm": 7.756059818874078, "learning_rate": 1.3574660633484164e-06, "loss": 1.7249, "step": 60 }, { "epoch": 0.02210790357552825, "grad_norm": 18.463301673578197, "learning_rate": 1.4705882352941177e-06, "loss": 1.6644, "step": 65 }, { "epoch": 0.023808511542876578, "grad_norm": 10.995078036433805, "learning_rate": 1.583710407239819e-06, "loss": 1.649, "step": 70 }, { "epoch": 0.025509119510224904, "grad_norm": 12.94230015695223, "learning_rate": 1.6968325791855207e-06, "loss": 1.659, "step": 75 }, { "epoch": 0.027209727477573234, "grad_norm": 6.76101300426946, "learning_rate": 1.8099547511312218e-06, "loss": 1.6152, "step": 80 }, { "epoch": 0.02891033544492156, "grad_norm": 8.812543750769409, "learning_rate": 1.9230769230769234e-06, "loss": 1.5711, "step": 85 }, { "epoch": 0.030610943412269886, "grad_norm": 5.697844190371594, "learning_rate": 2.0361990950226245e-06, "loss": 1.5305, "step": 90 }, { "epoch": 0.03231155137961821, "grad_norm": 19.374366768238563, "learning_rate": 2.149321266968326e-06, "loss": 1.5062, "step": 95 }, { "epoch": 0.03401215934696654, "grad_norm": 5.157411864253761, "learning_rate": 2.2624434389140273e-06, "loss": 1.4838, "step": 100 }, { "epoch": 0.035712767314314865, "grad_norm": 13.81826997890456, "learning_rate": 2.3755656108597284e-06, "loss": 1.4604, "step": 105 }, { "epoch": 0.03741337528166319, "grad_norm": 8.371503617793609, "learning_rate": 2.48868778280543e-06, "loss": 1.424, "step": 110 }, { "epoch": 0.039113983249011525, "grad_norm": 4.489966249256709, "learning_rate": 2.6018099547511316e-06, "loss": 1.3639, "step": 115 }, { "epoch": 0.04081459121635985, "grad_norm": 6.698307717633603, "learning_rate": 2.7149321266968327e-06, "loss": 1.3189, "step": 120 }, { "epoch": 0.04251519918370818, "grad_norm": 7.92343503351275, "learning_rate": 2.8280542986425343e-06, "loss": 1.2216, "step": 125 }, { "epoch": 0.0442158071510565, "grad_norm": 7.3532950329675755, "learning_rate": 2.9411764705882355e-06, "loss": 1.1338, "step": 130 }, { "epoch": 0.04591641511840483, "grad_norm": 6.168611954311049, "learning_rate": 3.054298642533937e-06, "loss": 1.0414, "step": 135 }, { "epoch": 0.047617023085753156, "grad_norm": 5.875428318783832, "learning_rate": 3.167420814479638e-06, "loss": 0.9731, "step": 140 }, { "epoch": 0.04931763105310148, "grad_norm": 15.936106002351035, "learning_rate": 3.2805429864253398e-06, "loss": 0.9013, "step": 145 }, { "epoch": 0.05101823902044981, "grad_norm": 14.723970861685023, "learning_rate": 3.3936651583710413e-06, "loss": 0.8686, "step": 150 }, { "epoch": 0.052718846987798135, "grad_norm": 6.296754035865692, "learning_rate": 3.506787330316742e-06, "loss": 0.8403, "step": 155 }, { "epoch": 0.05441945495514647, "grad_norm": 11.410646614152762, "learning_rate": 3.6199095022624436e-06, "loss": 0.82, "step": 160 }, { "epoch": 0.056120062922494794, "grad_norm": 6.179450576960927, "learning_rate": 3.7330316742081452e-06, "loss": 0.8021, "step": 165 }, { "epoch": 0.05782067088984312, "grad_norm": 7.548417828622113, "learning_rate": 3.846153846153847e-06, "loss": 0.7834, "step": 170 }, { "epoch": 0.05952127885719145, "grad_norm": 8.169502681440726, "learning_rate": 3.959276018099548e-06, "loss": 0.7809, "step": 175 }, { "epoch": 0.06122188682453977, "grad_norm": 8.997947648112065, "learning_rate": 4.072398190045249e-06, "loss": 0.7669, "step": 180 }, { "epoch": 0.0629224947918881, "grad_norm": 23.942263352004463, "learning_rate": 4.185520361990951e-06, "loss": 0.7577, "step": 185 }, { "epoch": 0.06462310275923643, "grad_norm": 9.65549631769889, "learning_rate": 4.298642533936652e-06, "loss": 0.7592, "step": 190 }, { "epoch": 0.06632371072658476, "grad_norm": 9.151096835570465, "learning_rate": 4.411764705882353e-06, "loss": 0.732, "step": 195 }, { "epoch": 0.06802431869393308, "grad_norm": 23.689339149203136, "learning_rate": 4.5248868778280546e-06, "loss": 0.7169, "step": 200 }, { "epoch": 0.06972492666128141, "grad_norm": 9.503382054799566, "learning_rate": 4.6380090497737566e-06, "loss": 0.7167, "step": 205 }, { "epoch": 0.07142553462862973, "grad_norm": 11.060398318836771, "learning_rate": 4.751131221719457e-06, "loss": 0.7278, "step": 210 }, { "epoch": 0.07312614259597806, "grad_norm": 14.275008718187829, "learning_rate": 4.864253393665159e-06, "loss": 0.6725, "step": 215 }, { "epoch": 0.07482675056332638, "grad_norm": 9.254261400953006, "learning_rate": 4.97737556561086e-06, "loss": 0.7149, "step": 220 }, { "epoch": 0.07652735853067472, "grad_norm": 12.152819376143459, "learning_rate": 4.999999058430077e-06, "loss": 0.6756, "step": 225 }, { "epoch": 0.07822796649802305, "grad_norm": 12.71525277853713, "learning_rate": 4.999995233303476e-06, "loss": 0.72, "step": 230 }, { "epoch": 0.07992857446537137, "grad_norm": 16.52069302728288, "learning_rate": 4.999988465776579e-06, "loss": 0.68, "step": 235 }, { "epoch": 0.0816291824327197, "grad_norm": 21.514609789250752, "learning_rate": 4.999978755857349e-06, "loss": 0.6682, "step": 240 }, { "epoch": 0.08332979040006802, "grad_norm": 21.83122952918328, "learning_rate": 4.999966103557213e-06, "loss": 0.6931, "step": 245 }, { "epoch": 0.08503039836741635, "grad_norm": 8.826610512818476, "learning_rate": 4.999950508891065e-06, "loss": 0.7082, "step": 250 }, { "epoch": 0.08673100633476467, "grad_norm": 7.902453275444064, "learning_rate": 4.999931971877258e-06, "loss": 0.676, "step": 255 }, { "epoch": 0.088431614302113, "grad_norm": 16.854627629783014, "learning_rate": 4.99991049253761e-06, "loss": 0.6749, "step": 260 }, { "epoch": 0.09013222226946133, "grad_norm": 9.936283538247146, "learning_rate": 4.999886070897401e-06, "loss": 0.683, "step": 265 }, { "epoch": 0.09183283023680966, "grad_norm": 9.757808014367393, "learning_rate": 4.999858706985373e-06, "loss": 0.6862, "step": 270 }, { "epoch": 0.09353343820415799, "grad_norm": 19.431419507015317, "learning_rate": 4.999828400833734e-06, "loss": 0.6694, "step": 275 }, { "epoch": 0.09523404617150631, "grad_norm": 15.051911904223115, "learning_rate": 4.999795152478153e-06, "loss": 0.6801, "step": 280 }, { "epoch": 0.09693465413885465, "grad_norm": 6.485125825801793, "learning_rate": 4.999758961957761e-06, "loss": 0.6419, "step": 285 }, { "epoch": 0.09863526210620296, "grad_norm": 11.220090644662514, "learning_rate": 4.999719829315155e-06, "loss": 0.6408, "step": 290 }, { "epoch": 0.1003358700735513, "grad_norm": 10.519717283422201, "learning_rate": 4.99967775459639e-06, "loss": 0.6608, "step": 295 }, { "epoch": 0.10203647804089962, "grad_norm": 6.226029435794413, "learning_rate": 4.999632737850989e-06, "loss": 0.6091, "step": 300 }, { "epoch": 0.10373708600824795, "grad_norm": 8.430092149740332, "learning_rate": 4.999584779131933e-06, "loss": 0.6631, "step": 305 }, { "epoch": 0.10543769397559627, "grad_norm": 12.188582572432878, "learning_rate": 4.999533878495668e-06, "loss": 0.6165, "step": 310 }, { "epoch": 0.1071383019429446, "grad_norm": 45.910216646833675, "learning_rate": 4.9994800360021025e-06, "loss": 0.666, "step": 315 }, { "epoch": 0.10883890991029294, "grad_norm": 10.217829004467873, "learning_rate": 4.999423251714608e-06, "loss": 0.6705, "step": 320 }, { "epoch": 0.11053951787764126, "grad_norm": 14.882953923091867, "learning_rate": 4.999363525700016e-06, "loss": 0.6803, "step": 325 }, { "epoch": 0.11224012584498959, "grad_norm": 31.58978597081891, "learning_rate": 4.999300858028622e-06, "loss": 0.6537, "step": 330 }, { "epoch": 0.11394073381233791, "grad_norm": 13.780953871869633, "learning_rate": 4.999235248774183e-06, "loss": 0.662, "step": 335 }, { "epoch": 0.11564134177968624, "grad_norm": 10.335809816067288, "learning_rate": 4.999166698013921e-06, "loss": 0.6321, "step": 340 }, { "epoch": 0.11734194974703456, "grad_norm": 11.932357562298984, "learning_rate": 4.999095205828515e-06, "loss": 0.6204, "step": 345 }, { "epoch": 0.1190425577143829, "grad_norm": 15.051840430556013, "learning_rate": 4.99902077230211e-06, "loss": 0.6196, "step": 350 }, { "epoch": 0.12074316568173121, "grad_norm": 5.225931699307834, "learning_rate": 4.9989433975223105e-06, "loss": 0.6026, "step": 355 }, { "epoch": 0.12244377364907955, "grad_norm": 23.381306701490157, "learning_rate": 4.9988630815801845e-06, "loss": 0.6267, "step": 360 }, { "epoch": 0.12414438161642788, "grad_norm": 5.887716941478886, "learning_rate": 4.9987798245702615e-06, "loss": 0.64, "step": 365 }, { "epoch": 0.1258449895837762, "grad_norm": 9.763764194339249, "learning_rate": 4.99869362659053e-06, "loss": 0.6487, "step": 370 }, { "epoch": 0.12754559755112452, "grad_norm": 18.995686135308443, "learning_rate": 4.998604487742444e-06, "loss": 0.6238, "step": 375 }, { "epoch": 0.12924620551847285, "grad_norm": 16.79081962728407, "learning_rate": 4.998512408130914e-06, "loss": 0.6544, "step": 380 }, { "epoch": 0.13094681348582118, "grad_norm": 22.69480477412656, "learning_rate": 4.998417387864316e-06, "loss": 0.6659, "step": 385 }, { "epoch": 0.13264742145316952, "grad_norm": 8.466445322881029, "learning_rate": 4.998319427054486e-06, "loss": 0.6341, "step": 390 }, { "epoch": 0.13434802942051782, "grad_norm": 19.163211220414826, "learning_rate": 4.998218525816717e-06, "loss": 0.6256, "step": 395 }, { "epoch": 0.13604863738786616, "grad_norm": 10.50771347186857, "learning_rate": 4.99811468426977e-06, "loss": 0.6509, "step": 400 }, { "epoch": 0.1377492453552145, "grad_norm": 7.984299752999823, "learning_rate": 4.99800790253586e-06, "loss": 0.6003, "step": 405 }, { "epoch": 0.13944985332256282, "grad_norm": 14.463342596093812, "learning_rate": 4.997898180740665e-06, "loss": 0.6412, "step": 410 }, { "epoch": 0.14115046128991116, "grad_norm": 13.750816687294764, "learning_rate": 4.997785519013324e-06, "loss": 0.6466, "step": 415 }, { "epoch": 0.14285106925725946, "grad_norm": 10.10934252434204, "learning_rate": 4.997669917486437e-06, "loss": 0.6155, "step": 420 }, { "epoch": 0.1445516772246078, "grad_norm": 17.79848414615945, "learning_rate": 4.997551376296061e-06, "loss": 0.6091, "step": 425 }, { "epoch": 0.14625228519195613, "grad_norm": 8.761272326725976, "learning_rate": 4.997429895581715e-06, "loss": 0.6654, "step": 430 }, { "epoch": 0.14795289315930446, "grad_norm": 10.547931177670689, "learning_rate": 4.9973054754863765e-06, "loss": 0.6347, "step": 435 }, { "epoch": 0.14965350112665277, "grad_norm": 21.541634564039974, "learning_rate": 4.997178116156484e-06, "loss": 0.6056, "step": 440 }, { "epoch": 0.1513541090940011, "grad_norm": 58.40832117665425, "learning_rate": 4.997047817741935e-06, "loss": 0.599, "step": 445 }, { "epoch": 0.15305471706134943, "grad_norm": 42.61840891068491, "learning_rate": 4.996914580396085e-06, "loss": 0.6139, "step": 450 }, { "epoch": 0.15475532502869777, "grad_norm": 33.2602204534426, "learning_rate": 4.99677840427575e-06, "loss": 0.5814, "step": 455 }, { "epoch": 0.1564559329960461, "grad_norm": 17.75586783558878, "learning_rate": 4.9966392895412035e-06, "loss": 0.6225, "step": 460 }, { "epoch": 0.1581565409633944, "grad_norm": 22.11341977382103, "learning_rate": 4.996497236356179e-06, "loss": 0.6116, "step": 465 }, { "epoch": 0.15985714893074274, "grad_norm": 6.925398096788078, "learning_rate": 4.996352244887868e-06, "loss": 0.6278, "step": 470 }, { "epoch": 0.16155775689809107, "grad_norm": 13.062658589923394, "learning_rate": 4.996204315306918e-06, "loss": 0.6352, "step": 475 }, { "epoch": 0.1632583648654394, "grad_norm": 9.168214718861417, "learning_rate": 4.996053447787439e-06, "loss": 0.6279, "step": 480 }, { "epoch": 0.1649589728327877, "grad_norm": 11.089977042081552, "learning_rate": 4.995899642506995e-06, "loss": 0.5696, "step": 485 }, { "epoch": 0.16665958080013604, "grad_norm": 21.32464496096674, "learning_rate": 4.99574289964661e-06, "loss": 0.6424, "step": 490 }, { "epoch": 0.16836018876748438, "grad_norm": 10.20829075940166, "learning_rate": 4.995583219390764e-06, "loss": 0.6468, "step": 495 }, { "epoch": 0.1700607967348327, "grad_norm": 12.808347360487026, "learning_rate": 4.995420601927393e-06, "loss": 0.6019, "step": 500 }, { "epoch": 0.17176140470218104, "grad_norm": 14.52851578382901, "learning_rate": 4.9952550474478944e-06, "loss": 0.6404, "step": 505 }, { "epoch": 0.17346201266952935, "grad_norm": 9.455805978378836, "learning_rate": 4.995086556147118e-06, "loss": 0.6054, "step": 510 }, { "epoch": 0.17516262063687768, "grad_norm": 11.538105090235101, "learning_rate": 4.994915128223372e-06, "loss": 0.5966, "step": 515 }, { "epoch": 0.176863228604226, "grad_norm": 8.620211977586647, "learning_rate": 4.994740763878421e-06, "loss": 0.5818, "step": 520 }, { "epoch": 0.17856383657157435, "grad_norm": 5.597079164594491, "learning_rate": 4.994563463317485e-06, "loss": 0.6358, "step": 525 }, { "epoch": 0.18026444453892265, "grad_norm": 33.76289557696585, "learning_rate": 4.9943832267492395e-06, "loss": 0.6182, "step": 530 }, { "epoch": 0.18196505250627099, "grad_norm": 7.537374321416811, "learning_rate": 4.9942000543858175e-06, "loss": 0.5935, "step": 535 }, { "epoch": 0.18366566047361932, "grad_norm": 22.412650764298824, "learning_rate": 4.994013946442804e-06, "loss": 0.6073, "step": 540 }, { "epoch": 0.18536626844096765, "grad_norm": 8.955431330054688, "learning_rate": 4.993824903139243e-06, "loss": 0.6291, "step": 545 }, { "epoch": 0.18706687640831599, "grad_norm": 15.448389517405271, "learning_rate": 4.99363292469763e-06, "loss": 0.6185, "step": 550 }, { "epoch": 0.1887674843756643, "grad_norm": 15.638782533428731, "learning_rate": 4.993438011343918e-06, "loss": 0.6126, "step": 555 }, { "epoch": 0.19046809234301262, "grad_norm": 18.146240583453405, "learning_rate": 4.99324016330751e-06, "loss": 0.5975, "step": 560 }, { "epoch": 0.19216870031036096, "grad_norm": 6.679553445363789, "learning_rate": 4.993039380821268e-06, "loss": 0.5781, "step": 565 }, { "epoch": 0.1938693082777093, "grad_norm": 7.274465103154115, "learning_rate": 4.992835664121506e-06, "loss": 0.6128, "step": 570 }, { "epoch": 0.1955699162450576, "grad_norm": 8.554386699728521, "learning_rate": 4.9926290134479885e-06, "loss": 0.5844, "step": 575 }, { "epoch": 0.19727052421240593, "grad_norm": 15.461566631562391, "learning_rate": 4.992419429043937e-06, "loss": 0.5844, "step": 580 }, { "epoch": 0.19897113217975426, "grad_norm": 19.367094609515036, "learning_rate": 4.992206911156024e-06, "loss": 0.5846, "step": 585 }, { "epoch": 0.2006717401471026, "grad_norm": 7.949321770564925, "learning_rate": 4.991991460034376e-06, "loss": 0.5842, "step": 590 }, { "epoch": 0.20237234811445093, "grad_norm": 33.169106678809236, "learning_rate": 4.991773075932569e-06, "loss": 0.6138, "step": 595 }, { "epoch": 0.20407295608179923, "grad_norm": 11.807875501068386, "learning_rate": 4.991551759107634e-06, "loss": 0.605, "step": 600 }, { "epoch": 0.20577356404914757, "grad_norm": 9.818511278159773, "learning_rate": 4.991327509820053e-06, "loss": 0.59, "step": 605 }, { "epoch": 0.2074741720164959, "grad_norm": 13.519496693016531, "learning_rate": 4.991100328333758e-06, "loss": 0.5705, "step": 610 }, { "epoch": 0.20917477998384423, "grad_norm": 11.998677980651333, "learning_rate": 4.990870214916134e-06, "loss": 0.5961, "step": 615 }, { "epoch": 0.21087538795119254, "grad_norm": 12.57627298318073, "learning_rate": 4.990637169838016e-06, "loss": 0.549, "step": 620 }, { "epoch": 0.21257599591854087, "grad_norm": 17.643005954195313, "learning_rate": 4.990401193373688e-06, "loss": 0.5859, "step": 625 }, { "epoch": 0.2142766038858892, "grad_norm": 8.023911558473781, "learning_rate": 4.990162285800886e-06, "loss": 0.6005, "step": 630 }, { "epoch": 0.21597721185323754, "grad_norm": 11.829776562644458, "learning_rate": 4.989920447400795e-06, "loss": 0.5895, "step": 635 }, { "epoch": 0.21767781982058587, "grad_norm": 10.355274989975698, "learning_rate": 4.989675678458051e-06, "loss": 0.5783, "step": 640 }, { "epoch": 0.21937842778793418, "grad_norm": 8.69232719426023, "learning_rate": 4.989427979260736e-06, "loss": 0.5789, "step": 645 }, { "epoch": 0.2210790357552825, "grad_norm": 37.636377516133614, "learning_rate": 4.989177350100383e-06, "loss": 0.5719, "step": 650 }, { "epoch": 0.22277964372263084, "grad_norm": 5.815207714212499, "learning_rate": 4.988923791271976e-06, "loss": 0.583, "step": 655 }, { "epoch": 0.22448025168997918, "grad_norm": 40.068811104492795, "learning_rate": 4.98866730307394e-06, "loss": 0.6066, "step": 660 }, { "epoch": 0.22618085965732748, "grad_norm": 8.20744741521443, "learning_rate": 4.988407885808153e-06, "loss": 0.5652, "step": 665 }, { "epoch": 0.22788146762467582, "grad_norm": 11.283153989927639, "learning_rate": 4.988145539779941e-06, "loss": 0.5962, "step": 670 }, { "epoch": 0.22958207559202415, "grad_norm": 8.555390469834528, "learning_rate": 4.987880265298074e-06, "loss": 0.5461, "step": 675 }, { "epoch": 0.23128268355937248, "grad_norm": 34.05887296744781, "learning_rate": 4.987612062674771e-06, "loss": 0.547, "step": 680 }, { "epoch": 0.23298329152672081, "grad_norm": 7.011722422652996, "learning_rate": 4.9873409322256965e-06, "loss": 0.6133, "step": 685 }, { "epoch": 0.23468389949406912, "grad_norm": 108.36469510812252, "learning_rate": 4.9870668742699595e-06, "loss": 0.5601, "step": 690 }, { "epoch": 0.23638450746141745, "grad_norm": 9.570385046609053, "learning_rate": 4.986789889130117e-06, "loss": 0.5887, "step": 695 }, { "epoch": 0.2380851154287658, "grad_norm": 9.717325030089524, "learning_rate": 4.98650997713217e-06, "loss": 0.561, "step": 700 }, { "epoch": 0.23978572339611412, "grad_norm": 24.521659275717248, "learning_rate": 4.986227138605564e-06, "loss": 0.5919, "step": 705 }, { "epoch": 0.24148633136346243, "grad_norm": 7.215984020082289, "learning_rate": 4.985941373883189e-06, "loss": 0.5791, "step": 710 }, { "epoch": 0.24318693933081076, "grad_norm": 14.248665524828537, "learning_rate": 4.985652683301379e-06, "loss": 0.5486, "step": 715 }, { "epoch": 0.2448875472981591, "grad_norm": 8.789194246601989, "learning_rate": 4.985361067199915e-06, "loss": 0.5545, "step": 720 }, { "epoch": 0.24658815526550742, "grad_norm": 10.012395745311808, "learning_rate": 4.985066525922014e-06, "loss": 0.5868, "step": 725 }, { "epoch": 0.24828876323285576, "grad_norm": 8.64765545690827, "learning_rate": 4.984769059814343e-06, "loss": 0.5867, "step": 730 }, { "epoch": 0.24998937120020406, "grad_norm": 6.843706075436087, "learning_rate": 4.984468669227007e-06, "loss": 0.5557, "step": 735 }, { "epoch": 0.2516899791675524, "grad_norm": 19.947108102711866, "learning_rate": 4.984165354513555e-06, "loss": 0.5571, "step": 740 }, { "epoch": 0.2533905871349007, "grad_norm": 11.780021114199064, "learning_rate": 4.983859116030976e-06, "loss": 0.5666, "step": 745 }, { "epoch": 0.25509119510224904, "grad_norm": 5.630908993158177, "learning_rate": 4.983549954139702e-06, "loss": 0.5511, "step": 750 }, { "epoch": 0.25679180306959737, "grad_norm": 6.928219837079772, "learning_rate": 4.983237869203606e-06, "loss": 0.5519, "step": 755 }, { "epoch": 0.2584924110369457, "grad_norm": 9.85491708098051, "learning_rate": 4.982922861589997e-06, "loss": 0.5477, "step": 760 }, { "epoch": 0.26019301900429403, "grad_norm": 21.197537518574446, "learning_rate": 4.982604931669631e-06, "loss": 0.5686, "step": 765 }, { "epoch": 0.26189362697164237, "grad_norm": 5.825778604217589, "learning_rate": 4.982284079816697e-06, "loss": 0.5858, "step": 770 }, { "epoch": 0.2635942349389907, "grad_norm": 5.1625548382347075, "learning_rate": 4.981960306408826e-06, "loss": 0.5715, "step": 775 }, { "epoch": 0.26529484290633903, "grad_norm": 10.036784754520419, "learning_rate": 4.981633611827088e-06, "loss": 0.5551, "step": 780 }, { "epoch": 0.26699545087368737, "grad_norm": 10.305028664800242, "learning_rate": 4.98130399645599e-06, "loss": 0.5599, "step": 785 }, { "epoch": 0.26869605884103565, "grad_norm": 4.589228297867227, "learning_rate": 4.980971460683475e-06, "loss": 0.5949, "step": 790 }, { "epoch": 0.270396666808384, "grad_norm": 8.27995784938204, "learning_rate": 4.980636004900927e-06, "loss": 0.56, "step": 795 }, { "epoch": 0.2720972747757323, "grad_norm": 4.493782493116703, "learning_rate": 4.980297629503165e-06, "loss": 0.5835, "step": 800 }, { "epoch": 0.27379788274308064, "grad_norm": 7.126752565086873, "learning_rate": 4.979956334888443e-06, "loss": 0.5671, "step": 805 }, { "epoch": 0.275498490710429, "grad_norm": 4.463794058054743, "learning_rate": 4.979612121458452e-06, "loss": 0.5703, "step": 810 }, { "epoch": 0.2771990986777773, "grad_norm": 5.351060624932273, "learning_rate": 4.9792649896183195e-06, "loss": 0.5907, "step": 815 }, { "epoch": 0.27889970664512564, "grad_norm": 92.26169791363449, "learning_rate": 4.978914939776606e-06, "loss": 0.5393, "step": 820 }, { "epoch": 0.280600314612474, "grad_norm": 9.493610254132465, "learning_rate": 4.978561972345306e-06, "loss": 0.5723, "step": 825 }, { "epoch": 0.2823009225798223, "grad_norm": 7.8072634292712575, "learning_rate": 4.978206087739851e-06, "loss": 0.5459, "step": 830 }, { "epoch": 0.2840015305471706, "grad_norm": 5.331414597258025, "learning_rate": 4.9778472863791e-06, "loss": 0.5611, "step": 835 }, { "epoch": 0.2857021385145189, "grad_norm": 16.750010087853518, "learning_rate": 4.977485568685353e-06, "loss": 0.5804, "step": 840 }, { "epoch": 0.28740274648186726, "grad_norm": 9.52017836985358, "learning_rate": 4.977120935084336e-06, "loss": 0.5626, "step": 845 }, { "epoch": 0.2891033544492156, "grad_norm": 5.48572070726049, "learning_rate": 4.97675338600521e-06, "loss": 0.6, "step": 850 }, { "epoch": 0.2908039624165639, "grad_norm": 4.410706726025624, "learning_rate": 4.976382921880564e-06, "loss": 0.5426, "step": 855 }, { "epoch": 0.29250457038391225, "grad_norm": 6.059898850103638, "learning_rate": 4.976009543146423e-06, "loss": 0.543, "step": 860 }, { "epoch": 0.2942051783512606, "grad_norm": 20.172923388018912, "learning_rate": 4.975633250242239e-06, "loss": 0.5557, "step": 865 }, { "epoch": 0.2959057863186089, "grad_norm": 5.932443651579922, "learning_rate": 4.975254043610894e-06, "loss": 0.5806, "step": 870 }, { "epoch": 0.29760639428595725, "grad_norm": 3.8592716485343224, "learning_rate": 4.9748719236987e-06, "loss": 0.5553, "step": 875 }, { "epoch": 0.29930700225330553, "grad_norm": 3.978579777692364, "learning_rate": 4.974486890955398e-06, "loss": 0.5903, "step": 880 }, { "epoch": 0.30100761022065387, "grad_norm": 6.483714401233538, "learning_rate": 4.9740989458341574e-06, "loss": 0.527, "step": 885 }, { "epoch": 0.3027082181880022, "grad_norm": 6.120570644247395, "learning_rate": 4.973708088791574e-06, "loss": 0.5599, "step": 890 }, { "epoch": 0.30440882615535053, "grad_norm": 7.440959126495233, "learning_rate": 4.973314320287674e-06, "loss": 0.5384, "step": 895 }, { "epoch": 0.30610943412269886, "grad_norm": 5.871413458662885, "learning_rate": 4.972917640785906e-06, "loss": 0.5498, "step": 900 }, { "epoch": 0.3078100420900472, "grad_norm": 3.61871679460945, "learning_rate": 4.972518050753146e-06, "loss": 0.5527, "step": 905 }, { "epoch": 0.30951065005739553, "grad_norm": 3.7899680523970214, "learning_rate": 4.9721155506597e-06, "loss": 0.5674, "step": 910 }, { "epoch": 0.31121125802474386, "grad_norm": 5.213262776183392, "learning_rate": 4.971710140979292e-06, "loss": 0.5383, "step": 915 }, { "epoch": 0.3129118659920922, "grad_norm": 5.3160173941547075, "learning_rate": 4.971301822189077e-06, "loss": 0.5613, "step": 920 }, { "epoch": 0.3146124739594405, "grad_norm": 5.996017735597043, "learning_rate": 4.970890594769627e-06, "loss": 0.5563, "step": 925 }, { "epoch": 0.3163130819267888, "grad_norm": 6.198763177382854, "learning_rate": 4.970476459204945e-06, "loss": 0.5409, "step": 930 }, { "epoch": 0.31801368989413714, "grad_norm": 4.017575965045663, "learning_rate": 4.97005941598245e-06, "loss": 0.5448, "step": 935 }, { "epoch": 0.3197142978614855, "grad_norm": 3.672395789135882, "learning_rate": 4.9696394655929884e-06, "loss": 0.5129, "step": 940 }, { "epoch": 0.3214149058288338, "grad_norm": 24.946087982059257, "learning_rate": 4.9692166085308244e-06, "loss": 0.5546, "step": 945 }, { "epoch": 0.32311551379618214, "grad_norm": 5.3019007125672415, "learning_rate": 4.968790845293646e-06, "loss": 0.5655, "step": 950 }, { "epoch": 0.3248161217635305, "grad_norm": 5.693729615742935, "learning_rate": 4.96836217638256e-06, "loss": 0.5535, "step": 955 }, { "epoch": 0.3265167297308788, "grad_norm": 5.49960606645776, "learning_rate": 4.967930602302094e-06, "loss": 0.5707, "step": 960 }, { "epoch": 0.32821733769822714, "grad_norm": 16.264894478999373, "learning_rate": 4.967496123560193e-06, "loss": 0.5545, "step": 965 }, { "epoch": 0.3299179456655754, "grad_norm": 7.924514395325215, "learning_rate": 4.9670587406682235e-06, "loss": 0.562, "step": 970 }, { "epoch": 0.33161855363292375, "grad_norm": 8.601257289905918, "learning_rate": 4.966618454140969e-06, "loss": 0.5537, "step": 975 }, { "epoch": 0.3333191616002721, "grad_norm": 8.691735260143128, "learning_rate": 4.966175264496629e-06, "loss": 0.5569, "step": 980 }, { "epoch": 0.3350197695676204, "grad_norm": 19.92859770802887, "learning_rate": 4.965729172256822e-06, "loss": 0.5485, "step": 985 }, { "epoch": 0.33672037753496875, "grad_norm": 6.849877453900109, "learning_rate": 4.9652801779465815e-06, "loss": 0.5443, "step": 990 }, { "epoch": 0.3384209855023171, "grad_norm": 6.020515213223856, "learning_rate": 4.964828282094356e-06, "loss": 0.5159, "step": 995 }, { "epoch": 0.3401215934696654, "grad_norm": 18.6239988809091, "learning_rate": 4.964373485232012e-06, "loss": 0.5711, "step": 1000 }, { "epoch": 0.34182220143701375, "grad_norm": 4.822075213885056, "learning_rate": 4.963915787894827e-06, "loss": 0.5349, "step": 1005 }, { "epoch": 0.3435228094043621, "grad_norm": 8.028398581194914, "learning_rate": 4.963455190621492e-06, "loss": 0.5413, "step": 1010 }, { "epoch": 0.34522341737171036, "grad_norm": 5.371954315819236, "learning_rate": 4.962991693954115e-06, "loss": 0.5536, "step": 1015 }, { "epoch": 0.3469240253390587, "grad_norm": 5.842648577621306, "learning_rate": 4.962525298438213e-06, "loss": 0.5436, "step": 1020 }, { "epoch": 0.34862463330640703, "grad_norm": 4.632859936784621, "learning_rate": 4.962056004622716e-06, "loss": 0.5334, "step": 1025 }, { "epoch": 0.35032524127375536, "grad_norm": 8.003687856035425, "learning_rate": 4.961583813059966e-06, "loss": 0.5131, "step": 1030 }, { "epoch": 0.3520258492411037, "grad_norm": 11.804117251299203, "learning_rate": 4.961108724305714e-06, "loss": 0.5131, "step": 1035 }, { "epoch": 0.353726457208452, "grad_norm": 5.612226850601007, "learning_rate": 4.960630738919122e-06, "loss": 0.5585, "step": 1040 }, { "epoch": 0.35542706517580036, "grad_norm": 10.67681254905423, "learning_rate": 4.9601498574627604e-06, "loss": 0.5269, "step": 1045 }, { "epoch": 0.3571276731431487, "grad_norm": 5.676959649188794, "learning_rate": 4.959666080502609e-06, "loss": 0.5331, "step": 1050 }, { "epoch": 0.358828281110497, "grad_norm": 7.879954337915793, "learning_rate": 4.959179408608053e-06, "loss": 0.5649, "step": 1055 }, { "epoch": 0.3605288890778453, "grad_norm": 10.994295716410706, "learning_rate": 4.958689842351891e-06, "loss": 0.5212, "step": 1060 }, { "epoch": 0.36222949704519364, "grad_norm": 6.393504474286174, "learning_rate": 4.95819738231032e-06, "loss": 0.5469, "step": 1065 }, { "epoch": 0.36393010501254197, "grad_norm": 5.526703277625495, "learning_rate": 4.95770202906295e-06, "loss": 0.5405, "step": 1070 }, { "epoch": 0.3656307129798903, "grad_norm": 4.905009308794142, "learning_rate": 4.957203783192791e-06, "loss": 0.5049, "step": 1075 }, { "epoch": 0.36733132094723864, "grad_norm": 3.650707522940902, "learning_rate": 4.956702645286261e-06, "loss": 0.5308, "step": 1080 }, { "epoch": 0.36903192891458697, "grad_norm": 26.974090854070898, "learning_rate": 4.95619861593318e-06, "loss": 0.5526, "step": 1085 }, { "epoch": 0.3707325368819353, "grad_norm": 5.833810588702081, "learning_rate": 4.955691695726771e-06, "loss": 0.5483, "step": 1090 }, { "epoch": 0.37243314484928364, "grad_norm": 8.1828994258448, "learning_rate": 4.95518188526366e-06, "loss": 0.5137, "step": 1095 }, { "epoch": 0.37413375281663197, "grad_norm": 4.851544353465898, "learning_rate": 4.954669185143876e-06, "loss": 0.5173, "step": 1100 }, { "epoch": 0.37583436078398025, "grad_norm": 5.393727849252035, "learning_rate": 4.9541535959708466e-06, "loss": 0.5568, "step": 1105 }, { "epoch": 0.3775349687513286, "grad_norm": 4.42721889010259, "learning_rate": 4.953635118351401e-06, "loss": 0.5273, "step": 1110 }, { "epoch": 0.3792355767186769, "grad_norm": 9.849567416783387, "learning_rate": 4.953113752895769e-06, "loss": 0.5445, "step": 1115 }, { "epoch": 0.38093618468602525, "grad_norm": 6.044269741022495, "learning_rate": 4.952589500217576e-06, "loss": 0.5313, "step": 1120 }, { "epoch": 0.3826367926533736, "grad_norm": 4.982181056132783, "learning_rate": 4.952062360933849e-06, "loss": 0.5473, "step": 1125 }, { "epoch": 0.3843374006207219, "grad_norm": 4.301027365473603, "learning_rate": 4.9515323356650115e-06, "loss": 0.5688, "step": 1130 }, { "epoch": 0.38603800858807025, "grad_norm": 3.643635403946816, "learning_rate": 4.950999425034882e-06, "loss": 0.5363, "step": 1135 }, { "epoch": 0.3877386165554186, "grad_norm": 3.83748828648786, "learning_rate": 4.950463629670678e-06, "loss": 0.5212, "step": 1140 }, { "epoch": 0.3894392245227669, "grad_norm": 4.663616763812123, "learning_rate": 4.949924950203009e-06, "loss": 0.5326, "step": 1145 }, { "epoch": 0.3911398324901152, "grad_norm": 6.234666871922535, "learning_rate": 4.949383387265881e-06, "loss": 0.5414, "step": 1150 }, { "epoch": 0.3928404404574635, "grad_norm": 5.132252044177524, "learning_rate": 4.948838941496692e-06, "loss": 0.5429, "step": 1155 }, { "epoch": 0.39454104842481186, "grad_norm": 6.7815614041683885, "learning_rate": 4.948291613536237e-06, "loss": 0.5556, "step": 1160 }, { "epoch": 0.3962416563921602, "grad_norm": 6.167051999446675, "learning_rate": 4.947741404028697e-06, "loss": 0.5245, "step": 1165 }, { "epoch": 0.3979422643595085, "grad_norm": 3.9963057579837558, "learning_rate": 4.94718831362165e-06, "loss": 0.5151, "step": 1170 }, { "epoch": 0.39964287232685686, "grad_norm": 5.246564898589007, "learning_rate": 4.946632342966063e-06, "loss": 0.5208, "step": 1175 }, { "epoch": 0.4013434802942052, "grad_norm": 5.94690942725412, "learning_rate": 4.946073492716291e-06, "loss": 0.5308, "step": 1180 }, { "epoch": 0.4030440882615535, "grad_norm": 4.144690597020785, "learning_rate": 4.945511763530081e-06, "loss": 0.5042, "step": 1185 }, { "epoch": 0.40474469622890186, "grad_norm": 9.088228806626756, "learning_rate": 4.944947156068567e-06, "loss": 0.5161, "step": 1190 }, { "epoch": 0.40644530419625013, "grad_norm": 4.627393792456539, "learning_rate": 4.944379670996269e-06, "loss": 0.5397, "step": 1195 }, { "epoch": 0.40814591216359847, "grad_norm": 3.9794107239823315, "learning_rate": 4.943809308981097e-06, "loss": 0.5032, "step": 1200 }, { "epoch": 0.4098465201309468, "grad_norm": 5.199479025001829, "learning_rate": 4.943236070694346e-06, "loss": 0.506, "step": 1205 }, { "epoch": 0.41154712809829513, "grad_norm": 3.6782110958459286, "learning_rate": 4.942659956810695e-06, "loss": 0.5125, "step": 1210 }, { "epoch": 0.41324773606564347, "grad_norm": 5.334976336098045, "learning_rate": 4.9420809680082095e-06, "loss": 0.5203, "step": 1215 }, { "epoch": 0.4149483440329918, "grad_norm": 5.595794839734362, "learning_rate": 4.941499104968336e-06, "loss": 0.5499, "step": 1220 }, { "epoch": 0.41664895200034013, "grad_norm": 6.580528830910674, "learning_rate": 4.9409143683759065e-06, "loss": 0.4854, "step": 1225 }, { "epoch": 0.41834955996768847, "grad_norm": 3.524339063672003, "learning_rate": 4.940326758919133e-06, "loss": 0.5283, "step": 1230 }, { "epoch": 0.4200501679350368, "grad_norm": 5.1992380024915645, "learning_rate": 4.93973627728961e-06, "loss": 0.5289, "step": 1235 }, { "epoch": 0.4217507759023851, "grad_norm": 3.932664128734379, "learning_rate": 4.939142924182314e-06, "loss": 0.5446, "step": 1240 }, { "epoch": 0.4234513838697334, "grad_norm": 8.984495756116655, "learning_rate": 4.9385467002955965e-06, "loss": 0.5124, "step": 1245 }, { "epoch": 0.42515199183708174, "grad_norm": 4.594542649829299, "learning_rate": 4.937947606331192e-06, "loss": 0.5525, "step": 1250 }, { "epoch": 0.4268525998044301, "grad_norm": 3.943719389932639, "learning_rate": 4.937345642994211e-06, "loss": 0.5187, "step": 1255 }, { "epoch": 0.4285532077717784, "grad_norm": 6.6305586617254635, "learning_rate": 4.936740810993143e-06, "loss": 0.524, "step": 1260 }, { "epoch": 0.43025381573912674, "grad_norm": 4.2711052445927935, "learning_rate": 4.936133111039852e-06, "loss": 0.5247, "step": 1265 }, { "epoch": 0.4319544237064751, "grad_norm": 4.105088800881805, "learning_rate": 4.9355225438495755e-06, "loss": 0.5343, "step": 1270 }, { "epoch": 0.4336550316738234, "grad_norm": 29.45639019594073, "learning_rate": 4.934909110140932e-06, "loss": 0.5107, "step": 1275 }, { "epoch": 0.43535563964117174, "grad_norm": 9.843585653327034, "learning_rate": 4.934292810635907e-06, "loss": 0.4964, "step": 1280 }, { "epoch": 0.43705624760852, "grad_norm": 4.397799128373228, "learning_rate": 4.933673646059863e-06, "loss": 0.5314, "step": 1285 }, { "epoch": 0.43875685557586835, "grad_norm": 3.6290143915072637, "learning_rate": 4.933051617141533e-06, "loss": 0.5169, "step": 1290 }, { "epoch": 0.4404574635432167, "grad_norm": 4.597619249145893, "learning_rate": 4.932426724613023e-06, "loss": 0.5304, "step": 1295 }, { "epoch": 0.442158071510565, "grad_norm": 4.281712154934983, "learning_rate": 4.931798969209806e-06, "loss": 0.5203, "step": 1300 }, { "epoch": 0.44385867947791335, "grad_norm": 4.16242238081176, "learning_rate": 4.931168351670727e-06, "loss": 0.5115, "step": 1305 }, { "epoch": 0.4455592874452617, "grad_norm": 3.668437512941314, "learning_rate": 4.930534872737999e-06, "loss": 0.5134, "step": 1310 }, { "epoch": 0.44725989541261, "grad_norm": 3.914120522681882, "learning_rate": 4.929898533157206e-06, "loss": 0.5059, "step": 1315 }, { "epoch": 0.44896050337995835, "grad_norm": 17.584805925934216, "learning_rate": 4.92925933367729e-06, "loss": 0.5062, "step": 1320 }, { "epoch": 0.4506611113473067, "grad_norm": 9.716755292603246, "learning_rate": 4.928617275050569e-06, "loss": 0.543, "step": 1325 }, { "epoch": 0.45236171931465496, "grad_norm": 9.825666844295863, "learning_rate": 4.927972358032721e-06, "loss": 0.5192, "step": 1330 }, { "epoch": 0.4540623272820033, "grad_norm": 5.309335957751379, "learning_rate": 4.927324583382788e-06, "loss": 0.4956, "step": 1335 }, { "epoch": 0.45576293524935163, "grad_norm": 6.105104421881541, "learning_rate": 4.926673951863178e-06, "loss": 0.5084, "step": 1340 }, { "epoch": 0.45746354321669996, "grad_norm": 9.0127339165914, "learning_rate": 4.926020464239658e-06, "loss": 0.5192, "step": 1345 }, { "epoch": 0.4591641511840483, "grad_norm": 7.508306727379984, "learning_rate": 4.92536412128136e-06, "loss": 0.5402, "step": 1350 }, { "epoch": 0.46086475915139663, "grad_norm": 10.958004027192123, "learning_rate": 4.924704923760773e-06, "loss": 0.5311, "step": 1355 }, { "epoch": 0.46256536711874496, "grad_norm": 23.348256819461614, "learning_rate": 4.924042872453749e-06, "loss": 0.5068, "step": 1360 }, { "epoch": 0.4642659750860933, "grad_norm": 4.384966711835645, "learning_rate": 4.923377968139498e-06, "loss": 0.5272, "step": 1365 }, { "epoch": 0.46596658305344163, "grad_norm": 5.097304462399159, "learning_rate": 4.922710211600586e-06, "loss": 0.4841, "step": 1370 }, { "epoch": 0.4676671910207899, "grad_norm": 8.238318724884653, "learning_rate": 4.922039603622939e-06, "loss": 0.5133, "step": 1375 }, { "epoch": 0.46936779898813824, "grad_norm": 4.312265953800085, "learning_rate": 4.921366144995835e-06, "loss": 0.5089, "step": 1380 }, { "epoch": 0.4710684069554866, "grad_norm": 21.21052229234248, "learning_rate": 4.920689836511911e-06, "loss": 0.5194, "step": 1385 }, { "epoch": 0.4727690149228349, "grad_norm": 6.828270310440812, "learning_rate": 4.920010678967158e-06, "loss": 0.4983, "step": 1390 }, { "epoch": 0.47446962289018324, "grad_norm": 16.456284342932708, "learning_rate": 4.919328673160916e-06, "loss": 0.5232, "step": 1395 }, { "epoch": 0.4761702308575316, "grad_norm": 5.452512136180211, "learning_rate": 4.918643819895881e-06, "loss": 0.4872, "step": 1400 }, { "epoch": 0.4778708388248799, "grad_norm": 10.00043037154549, "learning_rate": 4.917956119978101e-06, "loss": 0.5121, "step": 1405 }, { "epoch": 0.47957144679222824, "grad_norm": 6.648280131979151, "learning_rate": 4.917265574216972e-06, "loss": 0.4972, "step": 1410 }, { "epoch": 0.4812720547595766, "grad_norm": 7.056170116608334, "learning_rate": 4.9165721834252386e-06, "loss": 0.5109, "step": 1415 }, { "epoch": 0.48297266272692485, "grad_norm": 6.63777649357413, "learning_rate": 4.915875948418999e-06, "loss": 0.5093, "step": 1420 }, { "epoch": 0.4846732706942732, "grad_norm": 6.289903815492902, "learning_rate": 4.915176870017693e-06, "loss": 0.4841, "step": 1425 }, { "epoch": 0.4863738786616215, "grad_norm": 22.911413804487406, "learning_rate": 4.9144749490441116e-06, "loss": 0.5205, "step": 1430 }, { "epoch": 0.48807448662896985, "grad_norm": 6.84767145522513, "learning_rate": 4.913770186324387e-06, "loss": 0.5217, "step": 1435 }, { "epoch": 0.4897750945963182, "grad_norm": 15.458207722628924, "learning_rate": 4.9130625826879996e-06, "loss": 0.5044, "step": 1440 }, { "epoch": 0.4914757025636665, "grad_norm": 6.0191432394301145, "learning_rate": 4.912352138967773e-06, "loss": 0.5227, "step": 1445 }, { "epoch": 0.49317631053101485, "grad_norm": 5.759768357934574, "learning_rate": 4.911638855999872e-06, "loss": 0.5126, "step": 1450 }, { "epoch": 0.4948769184983632, "grad_norm": 26.23263891230328, "learning_rate": 4.910922734623804e-06, "loss": 0.5401, "step": 1455 }, { "epoch": 0.4965775264657115, "grad_norm": 8.5652855148443, "learning_rate": 4.910203775682416e-06, "loss": 0.5321, "step": 1460 }, { "epoch": 0.4982781344330598, "grad_norm": 11.817399115283937, "learning_rate": 4.909481980021897e-06, "loss": 0.4836, "step": 1465 }, { "epoch": 0.4999787424004081, "grad_norm": 7.423591169847086, "learning_rate": 4.908757348491772e-06, "loss": 0.5055, "step": 1470 }, { "epoch": 0.5016793503677565, "grad_norm": 11.43019213459179, "learning_rate": 4.9080298819449065e-06, "loss": 0.5026, "step": 1475 }, { "epoch": 0.5033799583351048, "grad_norm": 5.952995411347645, "learning_rate": 4.9072995812375e-06, "loss": 0.5047, "step": 1480 }, { "epoch": 0.5050805663024531, "grad_norm": 5.842345126987176, "learning_rate": 4.906566447229089e-06, "loss": 0.5295, "step": 1485 }, { "epoch": 0.5067811742698014, "grad_norm": 12.139042350418991, "learning_rate": 4.905830480782546e-06, "loss": 0.5132, "step": 1490 }, { "epoch": 0.5084817822371498, "grad_norm": 6.024078140148232, "learning_rate": 4.905091682764074e-06, "loss": 0.504, "step": 1495 }, { "epoch": 0.5101823902044981, "grad_norm": 6.014832908987949, "learning_rate": 4.904350054043212e-06, "loss": 0.5069, "step": 1500 }, { "epoch": 0.5118829981718465, "grad_norm": 4.221709319864658, "learning_rate": 4.9036055954928275e-06, "loss": 0.5106, "step": 1505 }, { "epoch": 0.5135836061391947, "grad_norm": 8.242477093088722, "learning_rate": 4.9028583079891225e-06, "loss": 0.5255, "step": 1510 }, { "epoch": 0.5152842141065431, "grad_norm": 7.853884867567823, "learning_rate": 4.902108192411623e-06, "loss": 0.5027, "step": 1515 }, { "epoch": 0.5169848220738914, "grad_norm": 8.378738866524177, "learning_rate": 4.90135524964319e-06, "loss": 0.5034, "step": 1520 }, { "epoch": 0.5186854300412398, "grad_norm": 7.10934057499516, "learning_rate": 4.900599480570007e-06, "loss": 0.5433, "step": 1525 }, { "epoch": 0.5203860380085881, "grad_norm": 3.633807957238506, "learning_rate": 4.899840886081587e-06, "loss": 0.4692, "step": 1530 }, { "epoch": 0.5220866459759363, "grad_norm": 22.01575433481114, "learning_rate": 4.899079467070765e-06, "loss": 0.5064, "step": 1535 }, { "epoch": 0.5237872539432847, "grad_norm": 7.008822525300486, "learning_rate": 4.898315224433705e-06, "loss": 0.503, "step": 1540 }, { "epoch": 0.525487861910633, "grad_norm": 6.499364764260329, "learning_rate": 4.89754815906989e-06, "loss": 0.5127, "step": 1545 }, { "epoch": 0.5271884698779814, "grad_norm": 7.1158336276148555, "learning_rate": 4.896778271882129e-06, "loss": 0.4955, "step": 1550 }, { "epoch": 0.5288890778453297, "grad_norm": 4.583909761477803, "learning_rate": 4.896005563776548e-06, "loss": 0.4843, "step": 1555 }, { "epoch": 0.5305896858126781, "grad_norm": 8.683598896960767, "learning_rate": 4.895230035662596e-06, "loss": 0.522, "step": 1560 }, { "epoch": 0.5322902937800263, "grad_norm": 9.43675992818845, "learning_rate": 4.894451688453041e-06, "loss": 0.5015, "step": 1565 }, { "epoch": 0.5339909017473747, "grad_norm": 6.993609954246139, "learning_rate": 4.893670523063969e-06, "loss": 0.4975, "step": 1570 }, { "epoch": 0.535691509714723, "grad_norm": 8.637779532146578, "learning_rate": 4.892886540414781e-06, "loss": 0.5043, "step": 1575 }, { "epoch": 0.5373921176820713, "grad_norm": 4.197137588803186, "learning_rate": 4.892099741428195e-06, "loss": 0.4979, "step": 1580 }, { "epoch": 0.5390927256494197, "grad_norm": 4.895856929628726, "learning_rate": 4.891310127030245e-06, "loss": 0.5354, "step": 1585 }, { "epoch": 0.540793333616768, "grad_norm": 77.59232070659658, "learning_rate": 4.890517698150277e-06, "loss": 0.4963, "step": 1590 }, { "epoch": 0.5424939415841163, "grad_norm": 5.082350540203327, "learning_rate": 4.8897224557209485e-06, "loss": 0.4849, "step": 1595 }, { "epoch": 0.5441945495514646, "grad_norm": 29.652950399011726, "learning_rate": 4.8889244006782315e-06, "loss": 0.5093, "step": 1600 }, { "epoch": 0.545895157518813, "grad_norm": 3.7542360216139876, "learning_rate": 4.8881235339614065e-06, "loss": 0.496, "step": 1605 }, { "epoch": 0.5475957654861613, "grad_norm": 5.102119315535732, "learning_rate": 4.887319856513064e-06, "loss": 0.4814, "step": 1610 }, { "epoch": 0.5492963734535097, "grad_norm": 48.189015212909666, "learning_rate": 4.8865133692791e-06, "loss": 0.5043, "step": 1615 }, { "epoch": 0.550996981420858, "grad_norm": 5.108464793813238, "learning_rate": 4.885704073208723e-06, "loss": 0.4871, "step": 1620 }, { "epoch": 0.5526975893882062, "grad_norm": 3.801975209912871, "learning_rate": 4.88489196925444e-06, "loss": 0.4954, "step": 1625 }, { "epoch": 0.5543981973555546, "grad_norm": 19.09107126052143, "learning_rate": 4.88407705837207e-06, "loss": 0.5217, "step": 1630 }, { "epoch": 0.5560988053229029, "grad_norm": 4.28185994919201, "learning_rate": 4.8832593415207306e-06, "loss": 0.4829, "step": 1635 }, { "epoch": 0.5577994132902513, "grad_norm": 5.515135680286602, "learning_rate": 4.882438819662844e-06, "loss": 0.5161, "step": 1640 }, { "epoch": 0.5595000212575996, "grad_norm": 41.17614904295365, "learning_rate": 4.881615493764136e-06, "loss": 0.5137, "step": 1645 }, { "epoch": 0.561200629224948, "grad_norm": 4.3491776766922605, "learning_rate": 4.8807893647936266e-06, "loss": 0.4831, "step": 1650 }, { "epoch": 0.5629012371922962, "grad_norm": 21.235110695196852, "learning_rate": 4.879960433723641e-06, "loss": 0.4687, "step": 1655 }, { "epoch": 0.5646018451596446, "grad_norm": 11.409972086344075, "learning_rate": 4.879128701529798e-06, "loss": 0.5001, "step": 1660 }, { "epoch": 0.5663024531269929, "grad_norm": 15.662946322646652, "learning_rate": 4.878294169191017e-06, "loss": 0.4718, "step": 1665 }, { "epoch": 0.5680030610943412, "grad_norm": 5.2615695796352835, "learning_rate": 4.87745683768951e-06, "loss": 0.5159, "step": 1670 }, { "epoch": 0.5697036690616896, "grad_norm": 5.4981450533202825, "learning_rate": 4.8766167080107845e-06, "loss": 0.5018, "step": 1675 }, { "epoch": 0.5714042770290378, "grad_norm": 4.772009449482228, "learning_rate": 4.875773781143642e-06, "loss": 0.4969, "step": 1680 }, { "epoch": 0.5731048849963862, "grad_norm": 6.217195880579487, "learning_rate": 4.874928058080176e-06, "loss": 0.5014, "step": 1685 }, { "epoch": 0.5748054929637345, "grad_norm": 4.835096996723914, "learning_rate": 4.87407953981577e-06, "loss": 0.5009, "step": 1690 }, { "epoch": 0.5765061009310829, "grad_norm": 11.76873010691928, "learning_rate": 4.873228227349098e-06, "loss": 0.5219, "step": 1695 }, { "epoch": 0.5782067088984312, "grad_norm": 7.812298909878062, "learning_rate": 4.872374121682124e-06, "loss": 0.4769, "step": 1700 }, { "epoch": 0.5799073168657796, "grad_norm": 6.354998163013655, "learning_rate": 4.871517223820097e-06, "loss": 0.512, "step": 1705 }, { "epoch": 0.5816079248331278, "grad_norm": 5.045657877477977, "learning_rate": 4.870657534771553e-06, "loss": 0.4787, "step": 1710 }, { "epoch": 0.5833085328004761, "grad_norm": 4.220803612156143, "learning_rate": 4.869795055548316e-06, "loss": 0.4906, "step": 1715 }, { "epoch": 0.5850091407678245, "grad_norm": 5.753072013667585, "learning_rate": 4.868929787165488e-06, "loss": 0.5104, "step": 1720 }, { "epoch": 0.5867097487351728, "grad_norm": 7.075151115902222, "learning_rate": 4.8680617306414605e-06, "loss": 0.4792, "step": 1725 }, { "epoch": 0.5884103567025212, "grad_norm": 7.925818725105634, "learning_rate": 4.867190886997902e-06, "loss": 0.4854, "step": 1730 }, { "epoch": 0.5901109646698695, "grad_norm": 4.251475398397968, "learning_rate": 4.8663172572597635e-06, "loss": 0.5057, "step": 1735 }, { "epoch": 0.5918115726372178, "grad_norm": 5.747439077315864, "learning_rate": 4.865440842455273e-06, "loss": 0.4957, "step": 1740 }, { "epoch": 0.5935121806045661, "grad_norm": 6.078600480831179, "learning_rate": 4.86456164361594e-06, "loss": 0.4954, "step": 1745 }, { "epoch": 0.5952127885719145, "grad_norm": 13.313329606941615, "learning_rate": 4.863679661776546e-06, "loss": 0.4921, "step": 1750 }, { "epoch": 0.5969133965392628, "grad_norm": 17.953981141289145, "learning_rate": 4.862794897975152e-06, "loss": 0.5218, "step": 1755 }, { "epoch": 0.5986140045066111, "grad_norm": 6.03392051370175, "learning_rate": 4.86190735325309e-06, "loss": 0.4786, "step": 1760 }, { "epoch": 0.6003146124739595, "grad_norm": 4.247443888478685, "learning_rate": 4.861017028654968e-06, "loss": 0.5048, "step": 1765 }, { "epoch": 0.6020152204413077, "grad_norm": 4.88629937124035, "learning_rate": 4.8601239252286656e-06, "loss": 0.4843, "step": 1770 }, { "epoch": 0.6037158284086561, "grad_norm": 4.053847272751819, "learning_rate": 4.859228044025329e-06, "loss": 0.5098, "step": 1775 }, { "epoch": 0.6054164363760044, "grad_norm": 20.647837698987473, "learning_rate": 4.85832938609938e-06, "loss": 0.5073, "step": 1780 }, { "epoch": 0.6071170443433528, "grad_norm": 3.5679725080288116, "learning_rate": 4.857427952508502e-06, "loss": 0.4983, "step": 1785 }, { "epoch": 0.6088176523107011, "grad_norm": 9.954128237756244, "learning_rate": 4.856523744313651e-06, "loss": 0.4835, "step": 1790 }, { "epoch": 0.6105182602780495, "grad_norm": 3.2295931672229274, "learning_rate": 4.855616762579045e-06, "loss": 0.4841, "step": 1795 }, { "epoch": 0.6122188682453977, "grad_norm": 4.346330125227621, "learning_rate": 4.854707008372166e-06, "loss": 0.5096, "step": 1800 }, { "epoch": 0.613919476212746, "grad_norm": 7.383006888543724, "learning_rate": 4.853794482763763e-06, "loss": 0.4938, "step": 1805 }, { "epoch": 0.6156200841800944, "grad_norm": 4.317265827403654, "learning_rate": 4.852879186827843e-06, "loss": 0.5038, "step": 1810 }, { "epoch": 0.6173206921474427, "grad_norm": 4.598508284769239, "learning_rate": 4.851961121641674e-06, "loss": 0.4873, "step": 1815 }, { "epoch": 0.6190213001147911, "grad_norm": 5.382585627951489, "learning_rate": 4.851040288285786e-06, "loss": 0.4984, "step": 1820 }, { "epoch": 0.6207219080821393, "grad_norm": 8.002827010093496, "learning_rate": 4.850116687843963e-06, "loss": 0.4788, "step": 1825 }, { "epoch": 0.6224225160494877, "grad_norm": 3.9799533025063973, "learning_rate": 4.849190321403251e-06, "loss": 0.4498, "step": 1830 }, { "epoch": 0.624123124016836, "grad_norm": 8.894414052201425, "learning_rate": 4.848261190053946e-06, "loss": 0.4979, "step": 1835 }, { "epoch": 0.6258237319841844, "grad_norm": 3.412756862000719, "learning_rate": 4.8473292948896005e-06, "loss": 0.4979, "step": 1840 }, { "epoch": 0.6275243399515327, "grad_norm": 3.873859775374139, "learning_rate": 4.846394637007022e-06, "loss": 0.4902, "step": 1845 }, { "epoch": 0.629224947918881, "grad_norm": 3.69179871568231, "learning_rate": 4.845457217506265e-06, "loss": 0.48, "step": 1850 }, { "epoch": 0.6309255558862293, "grad_norm": 8.0157501503533, "learning_rate": 4.84451703749064e-06, "loss": 0.4866, "step": 1855 }, { "epoch": 0.6326261638535776, "grad_norm": 6.609620215897019, "learning_rate": 4.843574098066701e-06, "loss": 0.4965, "step": 1860 }, { "epoch": 0.634326771820926, "grad_norm": 3.693916650709301, "learning_rate": 4.842628400344253e-06, "loss": 0.4947, "step": 1865 }, { "epoch": 0.6360273797882743, "grad_norm": 4.423863521675628, "learning_rate": 4.841679945436348e-06, "loss": 0.4773, "step": 1870 }, { "epoch": 0.6377279877556227, "grad_norm": 6.541175340939763, "learning_rate": 4.84072873445928e-06, "loss": 0.4829, "step": 1875 }, { "epoch": 0.639428595722971, "grad_norm": 3.9317693296152307, "learning_rate": 4.8397747685325895e-06, "loss": 0.4943, "step": 1880 }, { "epoch": 0.6411292036903193, "grad_norm": 4.774570588691984, "learning_rate": 4.838818048779057e-06, "loss": 0.4889, "step": 1885 }, { "epoch": 0.6428298116576676, "grad_norm": 4.703072623525204, "learning_rate": 4.837858576324707e-06, "loss": 0.5005, "step": 1890 }, { "epoch": 0.6445304196250159, "grad_norm": 5.772442022027065, "learning_rate": 4.8368963522988024e-06, "loss": 0.4796, "step": 1895 }, { "epoch": 0.6462310275923643, "grad_norm": 8.030324664728342, "learning_rate": 4.835931377833845e-06, "loss": 0.4812, "step": 1900 }, { "epoch": 0.6479316355597126, "grad_norm": 3.5413513374729724, "learning_rate": 4.834963654065572e-06, "loss": 0.4856, "step": 1905 }, { "epoch": 0.649632243527061, "grad_norm": 6.0837977052511025, "learning_rate": 4.833993182132959e-06, "loss": 0.4722, "step": 1910 }, { "epoch": 0.6513328514944092, "grad_norm": 8.02450472974402, "learning_rate": 4.833019963178214e-06, "loss": 0.4681, "step": 1915 }, { "epoch": 0.6530334594617576, "grad_norm": 4.181925906522143, "learning_rate": 4.832043998346781e-06, "loss": 0.4709, "step": 1920 }, { "epoch": 0.6547340674291059, "grad_norm": 5.974847517676, "learning_rate": 4.831065288787331e-06, "loss": 0.4918, "step": 1925 }, { "epoch": 0.6564346753964543, "grad_norm": 4.168052732228723, "learning_rate": 4.830083835651771e-06, "loss": 0.4852, "step": 1930 }, { "epoch": 0.6581352833638026, "grad_norm": 6.232875709754771, "learning_rate": 4.829099640095233e-06, "loss": 0.4813, "step": 1935 }, { "epoch": 0.6598358913311508, "grad_norm": 28.52667043886641, "learning_rate": 4.828112703276078e-06, "loss": 0.4605, "step": 1940 }, { "epoch": 0.6615364992984992, "grad_norm": 46.96459715556022, "learning_rate": 4.827123026355895e-06, "loss": 0.494, "step": 1945 }, { "epoch": 0.6632371072658475, "grad_norm": 6.5226820623683075, "learning_rate": 4.826130610499495e-06, "loss": 0.4722, "step": 1950 }, { "epoch": 0.6649377152331959, "grad_norm": 7.257587063242, "learning_rate": 4.8251354568749135e-06, "loss": 0.4574, "step": 1955 }, { "epoch": 0.6666383232005442, "grad_norm": 16.44207952733053, "learning_rate": 4.824137566653411e-06, "loss": 0.4688, "step": 1960 }, { "epoch": 0.6683389311678926, "grad_norm": 10.657328727348993, "learning_rate": 4.823136941009465e-06, "loss": 0.5342, "step": 1965 }, { "epoch": 0.6700395391352408, "grad_norm": 11.670933092961624, "learning_rate": 4.822133581120775e-06, "loss": 0.4665, "step": 1970 }, { "epoch": 0.6717401471025892, "grad_norm": 12.400540360619097, "learning_rate": 4.821127488168258e-06, "loss": 0.4973, "step": 1975 }, { "epoch": 0.6734407550699375, "grad_norm": 9.656617982085303, "learning_rate": 4.820118663336047e-06, "loss": 0.4849, "step": 1980 }, { "epoch": 0.6751413630372858, "grad_norm": 7.568416904448632, "learning_rate": 4.819107107811491e-06, "loss": 0.4602, "step": 1985 }, { "epoch": 0.6768419710046342, "grad_norm": 25.48241061674708, "learning_rate": 4.818092822785153e-06, "loss": 0.4568, "step": 1990 }, { "epoch": 0.6785425789719824, "grad_norm": 12.682770296425272, "learning_rate": 4.817075809450808e-06, "loss": 0.4829, "step": 1995 }, { "epoch": 0.6802431869393308, "grad_norm": 204.72845926361106, "learning_rate": 4.816056069005442e-06, "loss": 0.4807, "step": 2000 }, { "epoch": 0.6819437949066791, "grad_norm": 10.013600707099535, "learning_rate": 4.815033602649253e-06, "loss": 0.4473, "step": 2005 }, { "epoch": 0.6836444028740275, "grad_norm": 4.951889922224116, "learning_rate": 4.814008411585644e-06, "loss": 0.4805, "step": 2010 }, { "epoch": 0.6853450108413758, "grad_norm": 7.398756791518998, "learning_rate": 4.812980497021225e-06, "loss": 0.4641, "step": 2015 }, { "epoch": 0.6870456188087242, "grad_norm": 22.45007834808948, "learning_rate": 4.811949860165815e-06, "loss": 0.4614, "step": 2020 }, { "epoch": 0.6887462267760724, "grad_norm": 5.60126858273136, "learning_rate": 4.810916502232434e-06, "loss": 0.479, "step": 2025 }, { "epoch": 0.6904468347434207, "grad_norm": 7.2788557552454085, "learning_rate": 4.809880424437306e-06, "loss": 0.4944, "step": 2030 }, { "epoch": 0.6921474427107691, "grad_norm": 4.656186187366622, "learning_rate": 4.808841627999854e-06, "loss": 0.4833, "step": 2035 }, { "epoch": 0.6938480506781174, "grad_norm": 30.248464246047728, "learning_rate": 4.807800114142703e-06, "loss": 0.4691, "step": 2040 }, { "epoch": 0.6955486586454658, "grad_norm": 10.50047738121829, "learning_rate": 4.806755884091676e-06, "loss": 0.4793, "step": 2045 }, { "epoch": 0.6972492666128141, "grad_norm": 31.140998402104383, "learning_rate": 4.8057089390757924e-06, "loss": 0.5166, "step": 2050 }, { "epoch": 0.6989498745801624, "grad_norm": 8.90954865280583, "learning_rate": 4.804659280327268e-06, "loss": 0.4768, "step": 2055 }, { "epoch": 0.7006504825475107, "grad_norm": 5.63500786631946, "learning_rate": 4.803606909081509e-06, "loss": 0.4884, "step": 2060 }, { "epoch": 0.7023510905148591, "grad_norm": 7.777067999911534, "learning_rate": 4.802551826577119e-06, "loss": 0.4808, "step": 2065 }, { "epoch": 0.7040516984822074, "grad_norm": 75.5694198184285, "learning_rate": 4.8014940340558905e-06, "loss": 0.4969, "step": 2070 }, { "epoch": 0.7057523064495557, "grad_norm": 6.143201015797779, "learning_rate": 4.800433532762804e-06, "loss": 0.4624, "step": 2075 }, { "epoch": 0.707452914416904, "grad_norm": 15.70149527409961, "learning_rate": 4.79937032394603e-06, "loss": 0.4822, "step": 2080 }, { "epoch": 0.7091535223842523, "grad_norm": 5.0318953609604575, "learning_rate": 4.7983044088569265e-06, "loss": 0.499, "step": 2085 }, { "epoch": 0.7108541303516007, "grad_norm": 5.90864243120211, "learning_rate": 4.797235788750034e-06, "loss": 0.4735, "step": 2090 }, { "epoch": 0.712554738318949, "grad_norm": 15.263140292292057, "learning_rate": 4.796164464883078e-06, "loss": 0.4926, "step": 2095 }, { "epoch": 0.7142553462862974, "grad_norm": 7.923676324997792, "learning_rate": 4.795090438516969e-06, "loss": 0.4995, "step": 2100 }, { "epoch": 0.7159559542536457, "grad_norm": 8.587103946080287, "learning_rate": 4.794013710915793e-06, "loss": 0.4451, "step": 2105 }, { "epoch": 0.717656562220994, "grad_norm": 4.278158469703077, "learning_rate": 4.792934283346817e-06, "loss": 0.475, "step": 2110 }, { "epoch": 0.7193571701883423, "grad_norm": 9.063783270010104, "learning_rate": 4.79185215708049e-06, "loss": 0.4841, "step": 2115 }, { "epoch": 0.7210577781556906, "grad_norm": 10.694221466512458, "learning_rate": 4.790767333390431e-06, "loss": 0.4758, "step": 2120 }, { "epoch": 0.722758386123039, "grad_norm": 12.8885009650192, "learning_rate": 4.789679813553439e-06, "loss": 0.4966, "step": 2125 }, { "epoch": 0.7244589940903873, "grad_norm": 5.5429762120679715, "learning_rate": 4.788589598849482e-06, "loss": 0.4346, "step": 2130 }, { "epoch": 0.7261596020577357, "grad_norm": 5.05789608696174, "learning_rate": 4.787496690561701e-06, "loss": 0.5014, "step": 2135 }, { "epoch": 0.7278602100250839, "grad_norm": 9.032561200753488, "learning_rate": 4.786401089976411e-06, "loss": 0.4887, "step": 2140 }, { "epoch": 0.7295608179924323, "grad_norm": 9.045399707510782, "learning_rate": 4.78530279838309e-06, "loss": 0.4758, "step": 2145 }, { "epoch": 0.7312614259597806, "grad_norm": 3.9827265791533573, "learning_rate": 4.784201817074387e-06, "loss": 0.4973, "step": 2150 }, { "epoch": 0.732962033927129, "grad_norm": 4.782939617573199, "learning_rate": 4.783098147346116e-06, "loss": 0.4871, "step": 2155 }, { "epoch": 0.7346626418944773, "grad_norm": 34.78788179165436, "learning_rate": 4.7819917904972534e-06, "loss": 0.4889, "step": 2160 }, { "epoch": 0.7363632498618256, "grad_norm": 6.056648176350482, "learning_rate": 4.78088274782994e-06, "loss": 0.4651, "step": 2165 }, { "epoch": 0.7380638578291739, "grad_norm": 3.1753379829358948, "learning_rate": 4.779771020649478e-06, "loss": 0.4661, "step": 2170 }, { "epoch": 0.7397644657965222, "grad_norm": 5.045547762543024, "learning_rate": 4.778656610264327e-06, "loss": 0.4597, "step": 2175 }, { "epoch": 0.7414650737638706, "grad_norm": 1269.2808540525675, "learning_rate": 4.777539517986109e-06, "loss": 0.4991, "step": 2180 }, { "epoch": 0.7431656817312189, "grad_norm": 4.946687469448479, "learning_rate": 4.776419745129596e-06, "loss": 0.4759, "step": 2185 }, { "epoch": 0.7448662896985673, "grad_norm": 3.4129729667348045, "learning_rate": 4.775297293012719e-06, "loss": 0.4765, "step": 2190 }, { "epoch": 0.7465668976659156, "grad_norm": 4.111665483599155, "learning_rate": 4.774172162956565e-06, "loss": 0.5037, "step": 2195 }, { "epoch": 0.7482675056332639, "grad_norm": 3.429305468282582, "learning_rate": 4.773044356285367e-06, "loss": 0.4945, "step": 2200 }, { "epoch": 0.7499681136006122, "grad_norm": 5.395335191960881, "learning_rate": 4.771913874326513e-06, "loss": 0.4932, "step": 2205 }, { "epoch": 0.7516687215679605, "grad_norm": 5.678600438248466, "learning_rate": 4.770780718410535e-06, "loss": 0.4765, "step": 2210 }, { "epoch": 0.7533693295353089, "grad_norm": 5.155283968677559, "learning_rate": 4.769644889871116e-06, "loss": 0.4555, "step": 2215 }, { "epoch": 0.7550699375026572, "grad_norm": 4.8046584137620405, "learning_rate": 4.768506390045085e-06, "loss": 0.4633, "step": 2220 }, { "epoch": 0.7567705454700056, "grad_norm": 5.655460549118874, "learning_rate": 4.767365220272412e-06, "loss": 0.4666, "step": 2225 }, { "epoch": 0.7584711534373538, "grad_norm": 25.4746580475505, "learning_rate": 4.76622138189621e-06, "loss": 0.459, "step": 2230 }, { "epoch": 0.7601717614047022, "grad_norm": 6.276798385082229, "learning_rate": 4.7650748762627355e-06, "loss": 0.4816, "step": 2235 }, { "epoch": 0.7618723693720505, "grad_norm": 5.280921721834364, "learning_rate": 4.763925704721382e-06, "loss": 0.4534, "step": 2240 }, { "epoch": 0.7635729773393989, "grad_norm": 5.545205141472925, "learning_rate": 4.762773868624681e-06, "loss": 0.4845, "step": 2245 }, { "epoch": 0.7652735853067472, "grad_norm": 4.326151217698957, "learning_rate": 4.7616193693282995e-06, "loss": 0.5031, "step": 2250 }, { "epoch": 0.7669741932740954, "grad_norm": 20.35394810547081, "learning_rate": 4.76046220819104e-06, "loss": 0.4931, "step": 2255 }, { "epoch": 0.7686748012414438, "grad_norm": 4.459175948393059, "learning_rate": 4.759302386574839e-06, "loss": 0.4586, "step": 2260 }, { "epoch": 0.7703754092087921, "grad_norm": 6.448537415717612, "learning_rate": 4.758139905844762e-06, "loss": 0.4915, "step": 2265 }, { "epoch": 0.7720760171761405, "grad_norm": 4.674162493692723, "learning_rate": 4.756974767369005e-06, "loss": 0.4758, "step": 2270 }, { "epoch": 0.7737766251434888, "grad_norm": 5.882512364200616, "learning_rate": 4.755806972518891e-06, "loss": 0.4757, "step": 2275 }, { "epoch": 0.7754772331108372, "grad_norm": 7.673762814821439, "learning_rate": 4.754636522668873e-06, "loss": 0.488, "step": 2280 }, { "epoch": 0.7771778410781854, "grad_norm": 8.76539028078966, "learning_rate": 4.753463419196523e-06, "loss": 0.4725, "step": 2285 }, { "epoch": 0.7788784490455338, "grad_norm": 5.686655568345862, "learning_rate": 4.752287663482544e-06, "loss": 0.4487, "step": 2290 }, { "epoch": 0.7805790570128821, "grad_norm": 10.123980373339233, "learning_rate": 4.751109256910753e-06, "loss": 0.4721, "step": 2295 }, { "epoch": 0.7822796649802304, "grad_norm": 11.095650127885145, "learning_rate": 4.749928200868092e-06, "loss": 0.446, "step": 2300 }, { "epoch": 0.7839802729475788, "grad_norm": 5.396599575860084, "learning_rate": 4.748744496744617e-06, "loss": 0.4875, "step": 2305 }, { "epoch": 0.785680880914927, "grad_norm": 5.606663991669503, "learning_rate": 4.747558145933506e-06, "loss": 0.4378, "step": 2310 }, { "epoch": 0.7873814888822754, "grad_norm": 45.39123351898769, "learning_rate": 4.7463691498310475e-06, "loss": 0.4757, "step": 2315 }, { "epoch": 0.7890820968496237, "grad_norm": 17.03989550437034, "learning_rate": 4.745177509836646e-06, "loss": 0.4598, "step": 2320 }, { "epoch": 0.7907827048169721, "grad_norm": 5.307561123525143, "learning_rate": 4.743983227352817e-06, "loss": 0.4667, "step": 2325 }, { "epoch": 0.7924833127843204, "grad_norm": 7.788495450749629, "learning_rate": 4.742786303785185e-06, "loss": 0.4743, "step": 2330 }, { "epoch": 0.7941839207516688, "grad_norm": 14.053571890786356, "learning_rate": 4.741586740542485e-06, "loss": 0.4795, "step": 2335 }, { "epoch": 0.795884528719017, "grad_norm": 7.322934279287959, "learning_rate": 4.740384539036559e-06, "loss": 0.4729, "step": 2340 }, { "epoch": 0.7975851366863653, "grad_norm": 37.789877963267486, "learning_rate": 4.739179700682349e-06, "loss": 0.4697, "step": 2345 }, { "epoch": 0.7992857446537137, "grad_norm": 11.595352099933702, "learning_rate": 4.737972226897909e-06, "loss": 0.4745, "step": 2350 }, { "epoch": 0.800986352621062, "grad_norm": 25.6667298770931, "learning_rate": 4.736762119104386e-06, "loss": 0.4823, "step": 2355 }, { "epoch": 0.8026869605884104, "grad_norm": 4.220556063695543, "learning_rate": 4.735549378726035e-06, "loss": 0.4595, "step": 2360 }, { "epoch": 0.8043875685557587, "grad_norm": 6.57385849376313, "learning_rate": 4.734334007190204e-06, "loss": 0.4611, "step": 2365 }, { "epoch": 0.806088176523107, "grad_norm": 4.593605374546327, "learning_rate": 4.7331160059273384e-06, "loss": 0.486, "step": 2370 }, { "epoch": 0.8077887844904553, "grad_norm": 6.230998624428787, "learning_rate": 4.7318953763709815e-06, "loss": 0.4626, "step": 2375 }, { "epoch": 0.8094893924578037, "grad_norm": 4.308841381825005, "learning_rate": 4.730672119957769e-06, "loss": 0.4836, "step": 2380 }, { "epoch": 0.811190000425152, "grad_norm": 10.504270551953045, "learning_rate": 4.729446238127426e-06, "loss": 0.4838, "step": 2385 }, { "epoch": 0.8128906083925003, "grad_norm": 8.747795183006614, "learning_rate": 4.72821773232277e-06, "loss": 0.4655, "step": 2390 }, { "epoch": 0.8145912163598487, "grad_norm": 4.696438683929611, "learning_rate": 4.726986603989706e-06, "loss": 0.4554, "step": 2395 }, { "epoch": 0.8162918243271969, "grad_norm": 5.197297769902515, "learning_rate": 4.725752854577226e-06, "loss": 0.4854, "step": 2400 }, { "epoch": 0.8179924322945453, "grad_norm": 9.708673608244993, "learning_rate": 4.724516485537406e-06, "loss": 0.4588, "step": 2405 }, { "epoch": 0.8196930402618936, "grad_norm": 6.237931890661816, "learning_rate": 4.723277498325406e-06, "loss": 0.4478, "step": 2410 }, { "epoch": 0.821393648229242, "grad_norm": 5.926988930736583, "learning_rate": 4.722035894399467e-06, "loss": 0.4962, "step": 2415 }, { "epoch": 0.8230942561965903, "grad_norm": 3.4452720408933137, "learning_rate": 4.7207916752209114e-06, "loss": 0.4626, "step": 2420 }, { "epoch": 0.8247948641639387, "grad_norm": 5.756019877459417, "learning_rate": 4.719544842254138e-06, "loss": 0.4521, "step": 2425 }, { "epoch": 0.8264954721312869, "grad_norm": 10.606430075297766, "learning_rate": 4.7182953969666205e-06, "loss": 0.4817, "step": 2430 }, { "epoch": 0.8281960800986352, "grad_norm": 6.154783522566068, "learning_rate": 4.7170433408289115e-06, "loss": 0.4515, "step": 2435 }, { "epoch": 0.8298966880659836, "grad_norm": 7.466536394495599, "learning_rate": 4.715788675314632e-06, "loss": 0.4731, "step": 2440 }, { "epoch": 0.8315972960333319, "grad_norm": 4.166417892434687, "learning_rate": 4.714531401900477e-06, "loss": 0.4818, "step": 2445 }, { "epoch": 0.8332979040006803, "grad_norm": 9.757418884123936, "learning_rate": 4.713271522066209e-06, "loss": 0.4447, "step": 2450 }, { "epoch": 0.8349985119680285, "grad_norm": 5.439076498016159, "learning_rate": 4.712009037294661e-06, "loss": 0.4596, "step": 2455 }, { "epoch": 0.8366991199353769, "grad_norm": 17.66563572208141, "learning_rate": 4.710743949071729e-06, "loss": 0.4622, "step": 2460 }, { "epoch": 0.8383997279027252, "grad_norm": 7.339347866198813, "learning_rate": 4.709476258886374e-06, "loss": 0.4777, "step": 2465 }, { "epoch": 0.8401003358700736, "grad_norm": 5.808232850206273, "learning_rate": 4.7082059682306205e-06, "loss": 0.4658, "step": 2470 }, { "epoch": 0.8418009438374219, "grad_norm": 4.758995702808101, "learning_rate": 4.706933078599552e-06, "loss": 0.4586, "step": 2475 }, { "epoch": 0.8435015518047702, "grad_norm": 5.963329705407439, "learning_rate": 4.70565759149131e-06, "loss": 0.457, "step": 2480 }, { "epoch": 0.8452021597721185, "grad_norm": 3.032448624315757, "learning_rate": 4.7043795084070984e-06, "loss": 0.4813, "step": 2485 }, { "epoch": 0.8469027677394668, "grad_norm": 5.6970102027520415, "learning_rate": 4.703098830851172e-06, "loss": 0.4655, "step": 2490 }, { "epoch": 0.8486033757068152, "grad_norm": 4.745699201763051, "learning_rate": 4.701815560330838e-06, "loss": 0.5015, "step": 2495 }, { "epoch": 0.8503039836741635, "grad_norm": 4.211802120114542, "learning_rate": 4.700529698356459e-06, "loss": 0.496, "step": 2500 }, { "epoch": 0.8520045916415119, "grad_norm": 4.025902675598321, "learning_rate": 4.699241246441445e-06, "loss": 0.4853, "step": 2505 }, { "epoch": 0.8537051996088602, "grad_norm": 3.816704219546769, "learning_rate": 4.697950206102258e-06, "loss": 0.4383, "step": 2510 }, { "epoch": 0.8554058075762085, "grad_norm": 9.123638826957368, "learning_rate": 4.6966565788584e-06, "loss": 0.4801, "step": 2515 }, { "epoch": 0.8571064155435568, "grad_norm": 3.5778897585484786, "learning_rate": 4.695360366232425e-06, "loss": 0.4646, "step": 2520 }, { "epoch": 0.8588070235109051, "grad_norm": 16.129442045616248, "learning_rate": 4.694061569749926e-06, "loss": 0.4688, "step": 2525 }, { "epoch": 0.8605076314782535, "grad_norm": 6.115096395312036, "learning_rate": 4.692760190939536e-06, "loss": 0.4795, "step": 2530 }, { "epoch": 0.8622082394456018, "grad_norm": 3.880119186396497, "learning_rate": 4.69145623133293e-06, "loss": 0.4716, "step": 2535 }, { "epoch": 0.8639088474129502, "grad_norm": 4.367106638465115, "learning_rate": 4.690149692464819e-06, "loss": 0.4711, "step": 2540 }, { "epoch": 0.8656094553802984, "grad_norm": 6.151471448173627, "learning_rate": 4.688840575872949e-06, "loss": 0.4756, "step": 2545 }, { "epoch": 0.8673100633476468, "grad_norm": 4.2213353474054625, "learning_rate": 4.687528883098104e-06, "loss": 0.4862, "step": 2550 }, { "epoch": 0.8690106713149951, "grad_norm": 3.7338203683525846, "learning_rate": 4.686214615684095e-06, "loss": 0.4693, "step": 2555 }, { "epoch": 0.8707112792823435, "grad_norm": 3.793322509112781, "learning_rate": 4.684897775177765e-06, "loss": 0.4807, "step": 2560 }, { "epoch": 0.8724118872496918, "grad_norm": 3.3071137506742536, "learning_rate": 4.683578363128985e-06, "loss": 0.4685, "step": 2565 }, { "epoch": 0.87411249521704, "grad_norm": 3.70214370047083, "learning_rate": 4.6822563810906555e-06, "loss": 0.4666, "step": 2570 }, { "epoch": 0.8758131031843884, "grad_norm": 12.650059452463687, "learning_rate": 4.680931830618698e-06, "loss": 0.4861, "step": 2575 }, { "epoch": 0.8775137111517367, "grad_norm": 5.255235753352614, "learning_rate": 4.67960471327206e-06, "loss": 0.4642, "step": 2580 }, { "epoch": 0.8792143191190851, "grad_norm": 3.427445126764981, "learning_rate": 4.678275030612708e-06, "loss": 0.4574, "step": 2585 }, { "epoch": 0.8809149270864334, "grad_norm": 3.619414149116754, "learning_rate": 4.676942784205627e-06, "loss": 0.4566, "step": 2590 }, { "epoch": 0.8826155350537818, "grad_norm": 4.467195890103454, "learning_rate": 4.675607975618823e-06, "loss": 0.4398, "step": 2595 }, { "epoch": 0.88431614302113, "grad_norm": 7.306921975067082, "learning_rate": 4.674270606423315e-06, "loss": 0.4616, "step": 2600 }, { "epoch": 0.8860167509884784, "grad_norm": 6.891857969882333, "learning_rate": 4.672930678193135e-06, "loss": 0.4634, "step": 2605 }, { "epoch": 0.8877173589558267, "grad_norm": 9.568307356460663, "learning_rate": 4.671588192505329e-06, "loss": 0.4473, "step": 2610 }, { "epoch": 0.889417966923175, "grad_norm": 6.489021091237962, "learning_rate": 4.670243150939951e-06, "loss": 0.4602, "step": 2615 }, { "epoch": 0.8911185748905234, "grad_norm": 4.418966555081972, "learning_rate": 4.668895555080067e-06, "loss": 0.467, "step": 2620 }, { "epoch": 0.8928191828578717, "grad_norm": 8.801700053716434, "learning_rate": 4.667545406511745e-06, "loss": 0.4814, "step": 2625 }, { "epoch": 0.89451979082522, "grad_norm": 6.603121083107336, "learning_rate": 4.666192706824058e-06, "loss": 0.4666, "step": 2630 }, { "epoch": 0.8962203987925683, "grad_norm": 17.83521499702972, "learning_rate": 4.664837457609084e-06, "loss": 0.4595, "step": 2635 }, { "epoch": 0.8979210067599167, "grad_norm": 10.753629924265912, "learning_rate": 4.6634796604619e-06, "loss": 0.4613, "step": 2640 }, { "epoch": 0.899621614727265, "grad_norm": 4.53607565282488, "learning_rate": 4.662119316980581e-06, "loss": 0.4613, "step": 2645 }, { "epoch": 0.9013222226946134, "grad_norm": 28.28754483294441, "learning_rate": 4.6607564287662025e-06, "loss": 0.4687, "step": 2650 }, { "epoch": 0.9030228306619617, "grad_norm": 5.844627428539397, "learning_rate": 4.6593909974228305e-06, "loss": 0.4811, "step": 2655 }, { "epoch": 0.9047234386293099, "grad_norm": 16.725038565305354, "learning_rate": 4.658023024557528e-06, "loss": 0.4608, "step": 2660 }, { "epoch": 0.9064240465966583, "grad_norm": 5.72088266552718, "learning_rate": 4.656652511780346e-06, "loss": 0.4965, "step": 2665 }, { "epoch": 0.9081246545640066, "grad_norm": 8.49705017317251, "learning_rate": 4.655279460704327e-06, "loss": 0.5135, "step": 2670 }, { "epoch": 0.909825262531355, "grad_norm": 13.24214617793498, "learning_rate": 4.653903872945501e-06, "loss": 0.4133, "step": 2675 }, { "epoch": 0.9115258704987033, "grad_norm": 16.276010232458102, "learning_rate": 4.652525750122881e-06, "loss": 0.4976, "step": 2680 }, { "epoch": 0.9132264784660516, "grad_norm": 6.662302437602245, "learning_rate": 4.651145093858469e-06, "loss": 0.472, "step": 2685 }, { "epoch": 0.9149270864333999, "grad_norm": 6.4514546340510055, "learning_rate": 4.6497619057772435e-06, "loss": 0.4697, "step": 2690 }, { "epoch": 0.9166276944007483, "grad_norm": 5.021678339377137, "learning_rate": 4.648376187507165e-06, "loss": 0.4664, "step": 2695 }, { "epoch": 0.9183283023680966, "grad_norm": 6.669097101168566, "learning_rate": 4.646987940679171e-06, "loss": 0.4601, "step": 2700 }, { "epoch": 0.9200289103354449, "grad_norm": 5.902255296320931, "learning_rate": 4.645597166927177e-06, "loss": 0.432, "step": 2705 }, { "epoch": 0.9217295183027933, "grad_norm": 3.8720393482781272, "learning_rate": 4.644203867888071e-06, "loss": 0.4432, "step": 2710 }, { "epoch": 0.9234301262701415, "grad_norm": 4.313561539291421, "learning_rate": 4.642808045201713e-06, "loss": 0.4798, "step": 2715 }, { "epoch": 0.9251307342374899, "grad_norm": 3.977826142428004, "learning_rate": 4.641409700510935e-06, "loss": 0.4506, "step": 2720 }, { "epoch": 0.9268313422048382, "grad_norm": 4.959302496448603, "learning_rate": 4.640008835461535e-06, "loss": 0.4767, "step": 2725 }, { "epoch": 0.9285319501721866, "grad_norm": 5.247335980334496, "learning_rate": 4.638605451702279e-06, "loss": 0.4629, "step": 2730 }, { "epoch": 0.9302325581395349, "grad_norm": 4.279180890288213, "learning_rate": 4.637199550884896e-06, "loss": 0.4616, "step": 2735 }, { "epoch": 0.9319331661068833, "grad_norm": 6.556960210684756, "learning_rate": 4.635791134664079e-06, "loss": 0.4738, "step": 2740 }, { "epoch": 0.9336337740742315, "grad_norm": 4.8762624843050135, "learning_rate": 4.634380204697481e-06, "loss": 0.4566, "step": 2745 }, { "epoch": 0.9353343820415798, "grad_norm": 9.973695201929845, "learning_rate": 4.632966762645713e-06, "loss": 0.4344, "step": 2750 }, { "epoch": 0.9370349900089282, "grad_norm": 7.938627728975852, "learning_rate": 4.631550810172344e-06, "loss": 0.4724, "step": 2755 }, { "epoch": 0.9387355979762765, "grad_norm": 4.1641712047801756, "learning_rate": 4.630132348943895e-06, "loss": 0.4425, "step": 2760 }, { "epoch": 0.9404362059436249, "grad_norm": 4.969972919685798, "learning_rate": 4.628711380629843e-06, "loss": 0.466, "step": 2765 }, { "epoch": 0.9421368139109731, "grad_norm": 4.011899311812286, "learning_rate": 4.627287906902615e-06, "loss": 0.4923, "step": 2770 }, { "epoch": 0.9438374218783215, "grad_norm": 11.1196782659473, "learning_rate": 4.625861929437584e-06, "loss": 0.4652, "step": 2775 }, { "epoch": 0.9455380298456698, "grad_norm": 3.6956879692855003, "learning_rate": 4.6244334499130725e-06, "loss": 0.4909, "step": 2780 }, { "epoch": 0.9472386378130182, "grad_norm": 19.071716213434737, "learning_rate": 4.6230024700103485e-06, "loss": 0.4454, "step": 2785 }, { "epoch": 0.9489392457803665, "grad_norm": 7.606087801760514, "learning_rate": 4.621568991413619e-06, "loss": 0.4779, "step": 2790 }, { "epoch": 0.9506398537477148, "grad_norm": 14.696215182161811, "learning_rate": 4.6201330158100354e-06, "loss": 0.4957, "step": 2795 }, { "epoch": 0.9523404617150631, "grad_norm": 3.8448687891191873, "learning_rate": 4.618694544889688e-06, "loss": 0.4629, "step": 2800 }, { "epoch": 0.9540410696824114, "grad_norm": 4.724455462118728, "learning_rate": 4.617253580345602e-06, "loss": 0.4518, "step": 2805 }, { "epoch": 0.9557416776497598, "grad_norm": 5.1164099572684325, "learning_rate": 4.6158101238737385e-06, "loss": 0.4759, "step": 2810 }, { "epoch": 0.9574422856171081, "grad_norm": 3.324153731402897, "learning_rate": 4.6143641771729914e-06, "loss": 0.445, "step": 2815 }, { "epoch": 0.9591428935844565, "grad_norm": 6.728755458552797, "learning_rate": 4.612915741945185e-06, "loss": 0.4359, "step": 2820 }, { "epoch": 0.9608435015518048, "grad_norm": 5.2302562387140625, "learning_rate": 4.611464819895075e-06, "loss": 0.4576, "step": 2825 }, { "epoch": 0.9625441095191531, "grad_norm": 9.353759033223223, "learning_rate": 4.61001141273034e-06, "loss": 0.4831, "step": 2830 }, { "epoch": 0.9642447174865014, "grad_norm": 5.5242719103803495, "learning_rate": 4.608555522161586e-06, "loss": 0.503, "step": 2835 }, { "epoch": 0.9659453254538497, "grad_norm": 7.7545751238763865, "learning_rate": 4.607097149902342e-06, "loss": 0.4575, "step": 2840 }, { "epoch": 0.9676459334211981, "grad_norm": 5.8608037887213795, "learning_rate": 4.605636297669057e-06, "loss": 0.4603, "step": 2845 }, { "epoch": 0.9693465413885464, "grad_norm": 7.1451302730190065, "learning_rate": 4.6041729671811e-06, "loss": 0.4429, "step": 2850 }, { "epoch": 0.9710471493558948, "grad_norm": 6.7365333394401645, "learning_rate": 4.602707160160753e-06, "loss": 0.4698, "step": 2855 }, { "epoch": 0.972747757323243, "grad_norm": 3.6076879083981384, "learning_rate": 4.601238878333218e-06, "loss": 0.4658, "step": 2860 }, { "epoch": 0.9744483652905914, "grad_norm": 9.201931404991615, "learning_rate": 4.599768123426608e-06, "loss": 0.4593, "step": 2865 }, { "epoch": 0.9761489732579397, "grad_norm": 2.7281621411639883, "learning_rate": 4.598294897171945e-06, "loss": 0.4848, "step": 2870 }, { "epoch": 0.9778495812252881, "grad_norm": 9.274983810881102, "learning_rate": 4.596819201303161e-06, "loss": 0.4579, "step": 2875 }, { "epoch": 0.9795501891926364, "grad_norm": 3.935092331123125, "learning_rate": 4.595341037557095e-06, "loss": 0.4814, "step": 2880 }, { "epoch": 0.9812507971599846, "grad_norm": 5.058622072366201, "learning_rate": 4.59386040767349e-06, "loss": 0.4646, "step": 2885 }, { "epoch": 0.982951405127333, "grad_norm": 4.204448443843568, "learning_rate": 4.59237731339499e-06, "loss": 0.4688, "step": 2890 }, { "epoch": 0.9846520130946813, "grad_norm": 3.554591216834179, "learning_rate": 4.590891756467143e-06, "loss": 0.4388, "step": 2895 }, { "epoch": 0.9863526210620297, "grad_norm": 32.519673573312126, "learning_rate": 4.589403738638393e-06, "loss": 0.454, "step": 2900 }, { "epoch": 0.988053229029378, "grad_norm": 3.968138282150363, "learning_rate": 4.587913261660081e-06, "loss": 0.4524, "step": 2905 }, { "epoch": 0.9897538369967264, "grad_norm": 4.53973252164776, "learning_rate": 4.586420327286442e-06, "loss": 0.4525, "step": 2910 }, { "epoch": 0.9914544449640746, "grad_norm": 6.107299256583201, "learning_rate": 4.584924937274606e-06, "loss": 0.4386, "step": 2915 }, { "epoch": 0.993155052931423, "grad_norm": 3.4189689701799253, "learning_rate": 4.583427093384587e-06, "loss": 0.4676, "step": 2920 }, { "epoch": 0.9948556608987713, "grad_norm": 4.958127080161495, "learning_rate": 4.581926797379293e-06, "loss": 0.4401, "step": 2925 }, { "epoch": 0.9965562688661196, "grad_norm": 6.7040103587950775, "learning_rate": 4.580424051024514e-06, "loss": 0.449, "step": 2930 }, { "epoch": 0.998256876833468, "grad_norm": 4.556130318387315, "learning_rate": 4.57891885608893e-06, "loss": 0.445, "step": 2935 }, { "epoch": 0.9999574848008163, "grad_norm": 6.870877500958566, "learning_rate": 4.577411214344095e-06, "loss": 0.4826, "step": 2940 }, { "epoch": 1.0013604863738788, "grad_norm": 4.65609078495718, "learning_rate": 4.5759011275644476e-06, "loss": 0.3659, "step": 2945 }, { "epoch": 1.003061094341227, "grad_norm": 6.095258824323697, "learning_rate": 4.574388597527303e-06, "loss": 0.4694, "step": 2950 }, { "epoch": 1.0047617023085753, "grad_norm": 13.23422585656265, "learning_rate": 4.5728736260128534e-06, "loss": 0.4472, "step": 2955 }, { "epoch": 1.0064623102759236, "grad_norm": 12.631205425261738, "learning_rate": 4.571356214804162e-06, "loss": 0.4765, "step": 2960 }, { "epoch": 1.0081629182432719, "grad_norm": 5.619759816789756, "learning_rate": 4.569836365687164e-06, "loss": 0.4474, "step": 2965 }, { "epoch": 1.0098635262106204, "grad_norm": 7.497889977874089, "learning_rate": 4.568314080450667e-06, "loss": 0.4511, "step": 2970 }, { "epoch": 1.0115641341779686, "grad_norm": 10.195727142868547, "learning_rate": 4.566789360886341e-06, "loss": 0.4309, "step": 2975 }, { "epoch": 1.013264742145317, "grad_norm": 7.06258951989465, "learning_rate": 4.565262208788725e-06, "loss": 0.4776, "step": 2980 }, { "epoch": 1.0149653501126652, "grad_norm": 4.829993597545264, "learning_rate": 4.5637326259552195e-06, "loss": 0.4804, "step": 2985 }, { "epoch": 1.0166659580800137, "grad_norm": 3.9442902889511386, "learning_rate": 4.562200614186085e-06, "loss": 0.4682, "step": 2990 }, { "epoch": 1.018366566047362, "grad_norm": 16.10502076363894, "learning_rate": 4.560666175284441e-06, "loss": 0.4772, "step": 2995 }, { "epoch": 1.0200671740147103, "grad_norm": 6.765284857357975, "learning_rate": 4.559129311056268e-06, "loss": 0.4535, "step": 3000 }, { "epoch": 1.0217677819820585, "grad_norm": 11.530197956087518, "learning_rate": 4.557590023310393e-06, "loss": 0.4638, "step": 3005 }, { "epoch": 1.0234683899494068, "grad_norm": 3.2137510867237205, "learning_rate": 4.556048313858503e-06, "loss": 0.4323, "step": 3010 }, { "epoch": 1.0251689979167553, "grad_norm": 5.4221029924761535, "learning_rate": 4.554504184515129e-06, "loss": 0.4371, "step": 3015 }, { "epoch": 1.0268696058841036, "grad_norm": 4.507971574496705, "learning_rate": 4.552957637097657e-06, "loss": 0.4416, "step": 3020 }, { "epoch": 1.0285702138514519, "grad_norm": 9.912936642252575, "learning_rate": 4.551408673426311e-06, "loss": 0.4822, "step": 3025 }, { "epoch": 1.0302708218188001, "grad_norm": 5.346358979650128, "learning_rate": 4.5498572953241655e-06, "loss": 0.4549, "step": 3030 }, { "epoch": 1.0319714297861486, "grad_norm": 6.011299429761261, "learning_rate": 4.548303504617133e-06, "loss": 0.4475, "step": 3035 }, { "epoch": 1.033672037753497, "grad_norm": 3.2839622653873755, "learning_rate": 4.546747303133968e-06, "loss": 0.4557, "step": 3040 }, { "epoch": 1.0353726457208452, "grad_norm": 23.614972369165166, "learning_rate": 4.54518869270626e-06, "loss": 0.4278, "step": 3045 }, { "epoch": 1.0370732536881935, "grad_norm": 4.660578836842771, "learning_rate": 4.543627675168434e-06, "loss": 0.4481, "step": 3050 }, { "epoch": 1.0387738616555418, "grad_norm": 10.575905769665871, "learning_rate": 4.542064252357751e-06, "loss": 0.4574, "step": 3055 }, { "epoch": 1.0404744696228903, "grad_norm": 7.379932610615159, "learning_rate": 4.540498426114299e-06, "loss": 0.4684, "step": 3060 }, { "epoch": 1.0421750775902385, "grad_norm": 8.807344023575178, "learning_rate": 4.538930198280998e-06, "loss": 0.4404, "step": 3065 }, { "epoch": 1.0438756855575868, "grad_norm": 4.329853268424765, "learning_rate": 4.537359570703591e-06, "loss": 0.4524, "step": 3070 }, { "epoch": 1.045576293524935, "grad_norm": 3.074946328347183, "learning_rate": 4.53578654523065e-06, "loss": 0.445, "step": 3075 }, { "epoch": 1.0472769014922836, "grad_norm": 5.265770851396301, "learning_rate": 4.5342111237135655e-06, "loss": 0.4519, "step": 3080 }, { "epoch": 1.0489775094596319, "grad_norm": 10.079118708913205, "learning_rate": 4.53263330800655e-06, "loss": 0.4468, "step": 3085 }, { "epoch": 1.0506781174269801, "grad_norm": 4.1319553496504176, "learning_rate": 4.531053099966632e-06, "loss": 0.4688, "step": 3090 }, { "epoch": 1.0523787253943284, "grad_norm": 3.4649959921528852, "learning_rate": 4.529470501453659e-06, "loss": 0.4332, "step": 3095 }, { "epoch": 1.0540793333616767, "grad_norm": 6.613087258668495, "learning_rate": 4.527885514330287e-06, "loss": 0.4308, "step": 3100 }, { "epoch": 1.0557799413290252, "grad_norm": 3.239224295796761, "learning_rate": 4.5262981404619885e-06, "loss": 0.4474, "step": 3105 }, { "epoch": 1.0574805492963735, "grad_norm": 6.602689444846102, "learning_rate": 4.524708381717042e-06, "loss": 0.4446, "step": 3110 }, { "epoch": 1.0591811572637218, "grad_norm": 6.351423156609205, "learning_rate": 4.523116239966533e-06, "loss": 0.4614, "step": 3115 }, { "epoch": 1.06088176523107, "grad_norm": 5.347267558037939, "learning_rate": 4.521521717084354e-06, "loss": 0.4599, "step": 3120 }, { "epoch": 1.0625823731984185, "grad_norm": 6.26149946924594, "learning_rate": 4.519924814947197e-06, "loss": 0.4677, "step": 3125 }, { "epoch": 1.0642829811657668, "grad_norm": 5.167871496778332, "learning_rate": 4.518325535434557e-06, "loss": 0.4429, "step": 3130 }, { "epoch": 1.065983589133115, "grad_norm": 3.674103729786612, "learning_rate": 4.516723880428725e-06, "loss": 0.4338, "step": 3135 }, { "epoch": 1.0676841971004634, "grad_norm": 7.916041729824052, "learning_rate": 4.515119851814788e-06, "loss": 0.4385, "step": 3140 }, { "epoch": 1.0693848050678116, "grad_norm": 6.309378831258314, "learning_rate": 4.513513451480629e-06, "loss": 0.4474, "step": 3145 }, { "epoch": 1.0710854130351601, "grad_norm": 3.51815928246974, "learning_rate": 4.511904681316919e-06, "loss": 0.4418, "step": 3150 }, { "epoch": 1.0727860210025084, "grad_norm": 7.411610274427342, "learning_rate": 4.5102935432171215e-06, "loss": 0.464, "step": 3155 }, { "epoch": 1.0744866289698567, "grad_norm": 28.98091241308416, "learning_rate": 4.508680039077484e-06, "loss": 0.4333, "step": 3160 }, { "epoch": 1.076187236937205, "grad_norm": 4.84016751436113, "learning_rate": 4.507064170797041e-06, "loss": 0.4527, "step": 3165 }, { "epoch": 1.0778878449045535, "grad_norm": 28.89849564174859, "learning_rate": 4.505445940277608e-06, "loss": 0.4709, "step": 3170 }, { "epoch": 1.0795884528719017, "grad_norm": 6.460096323509922, "learning_rate": 4.503825349423782e-06, "loss": 0.4478, "step": 3175 }, { "epoch": 1.08128906083925, "grad_norm": 4.593375607263882, "learning_rate": 4.502202400142938e-06, "loss": 0.4602, "step": 3180 }, { "epoch": 1.0829896688065983, "grad_norm": 6.4386210353397795, "learning_rate": 4.500577094345224e-06, "loss": 0.4451, "step": 3185 }, { "epoch": 1.0846902767739466, "grad_norm": 6.617364744010471, "learning_rate": 4.498949433943567e-06, "loss": 0.469, "step": 3190 }, { "epoch": 1.086390884741295, "grad_norm": 9.878249271965329, "learning_rate": 4.497319420853658e-06, "loss": 0.4468, "step": 3195 }, { "epoch": 1.0880914927086434, "grad_norm": 13.343862517629125, "learning_rate": 4.495687056993966e-06, "loss": 0.4649, "step": 3200 }, { "epoch": 1.0897921006759916, "grad_norm": 8.119912932277256, "learning_rate": 4.4940523442857176e-06, "loss": 0.4563, "step": 3205 }, { "epoch": 1.09149270864334, "grad_norm": 5.451266829499762, "learning_rate": 4.49241528465291e-06, "loss": 0.438, "step": 3210 }, { "epoch": 1.0931933166106884, "grad_norm": 5.946305828392075, "learning_rate": 4.490775880022301e-06, "loss": 0.4577, "step": 3215 }, { "epoch": 1.0948939245780367, "grad_norm": 8.190063989790795, "learning_rate": 4.489134132323407e-06, "loss": 0.4244, "step": 3220 }, { "epoch": 1.096594532545385, "grad_norm": 5.297926056975686, "learning_rate": 4.487490043488504e-06, "loss": 0.4599, "step": 3225 }, { "epoch": 1.0982951405127332, "grad_norm": 4.262145728783015, "learning_rate": 4.485843615452622e-06, "loss": 0.4454, "step": 3230 }, { "epoch": 1.0999957484800815, "grad_norm": 5.098276012820175, "learning_rate": 4.484194850153546e-06, "loss": 0.4561, "step": 3235 }, { "epoch": 1.10169635644743, "grad_norm": 7.273781115260355, "learning_rate": 4.4825437495318105e-06, "loss": 0.468, "step": 3240 }, { "epoch": 1.1033969644147783, "grad_norm": 4.933918921533191, "learning_rate": 4.480890315530698e-06, "loss": 0.453, "step": 3245 }, { "epoch": 1.1050975723821266, "grad_norm": 17.626403443693594, "learning_rate": 4.479234550096238e-06, "loss": 0.4442, "step": 3250 }, { "epoch": 1.1067981803494749, "grad_norm": 23.676732400103766, "learning_rate": 4.477576455177205e-06, "loss": 0.4455, "step": 3255 }, { "epoch": 1.1084987883168234, "grad_norm": 6.900734191009607, "learning_rate": 4.475916032725114e-06, "loss": 0.4584, "step": 3260 }, { "epoch": 1.1101993962841716, "grad_norm": 16.725403823482687, "learning_rate": 4.474253284694219e-06, "loss": 0.4481, "step": 3265 }, { "epoch": 1.11190000425152, "grad_norm": 4.13809372200513, "learning_rate": 4.472588213041514e-06, "loss": 0.4184, "step": 3270 }, { "epoch": 1.1136006122188682, "grad_norm": 11.916770024145645, "learning_rate": 4.470920819726722e-06, "loss": 0.4359, "step": 3275 }, { "epoch": 1.1153012201862165, "grad_norm": 4.327376908798274, "learning_rate": 4.469251106712306e-06, "loss": 0.4727, "step": 3280 }, { "epoch": 1.117001828153565, "grad_norm": 15.701918872574687, "learning_rate": 4.467579075963452e-06, "loss": 0.4573, "step": 3285 }, { "epoch": 1.1187024361209132, "grad_norm": 3.1919875586325404, "learning_rate": 4.46590472944808e-06, "loss": 0.4645, "step": 3290 }, { "epoch": 1.1204030440882615, "grad_norm": 3.694077218569007, "learning_rate": 4.464228069136832e-06, "loss": 0.4337, "step": 3295 }, { "epoch": 1.1221036520556098, "grad_norm": 4.949030798295776, "learning_rate": 4.462549097003074e-06, "loss": 0.4358, "step": 3300 }, { "epoch": 1.1238042600229583, "grad_norm": 9.681946220701647, "learning_rate": 4.460867815022892e-06, "loss": 0.4263, "step": 3305 }, { "epoch": 1.1255048679903066, "grad_norm": 3.977714371887995, "learning_rate": 4.459184225175093e-06, "loss": 0.3886, "step": 3310 }, { "epoch": 1.1272054759576549, "grad_norm": 11.168871492637267, "learning_rate": 4.4574983294411986e-06, "loss": 0.4295, "step": 3315 }, { "epoch": 1.1289060839250031, "grad_norm": 6.249038479588919, "learning_rate": 4.455810129805443e-06, "loss": 0.4602, "step": 3320 }, { "epoch": 1.1306066918923516, "grad_norm": 6.658957506492276, "learning_rate": 4.454119628254776e-06, "loss": 0.4495, "step": 3325 }, { "epoch": 1.1323072998597, "grad_norm": 3.170050443727355, "learning_rate": 4.452426826778854e-06, "loss": 0.4616, "step": 3330 }, { "epoch": 1.1340079078270482, "grad_norm": 4.034518863293827, "learning_rate": 4.45073172737004e-06, "loss": 0.4699, "step": 3335 }, { "epoch": 1.1357085157943965, "grad_norm": 14.956232374656196, "learning_rate": 4.449034332023401e-06, "loss": 0.4378, "step": 3340 }, { "epoch": 1.1374091237617447, "grad_norm": 5.544782939969213, "learning_rate": 4.447334642736709e-06, "loss": 0.4552, "step": 3345 }, { "epoch": 1.1391097317290932, "grad_norm": 4.562157624305602, "learning_rate": 4.445632661510434e-06, "loss": 0.4546, "step": 3350 }, { "epoch": 1.1408103396964415, "grad_norm": 4.225927268811578, "learning_rate": 4.443928390347744e-06, "loss": 0.4529, "step": 3355 }, { "epoch": 1.1425109476637898, "grad_norm": 4.3257536228426074, "learning_rate": 4.442221831254502e-06, "loss": 0.4285, "step": 3360 }, { "epoch": 1.144211555631138, "grad_norm": 5.977479998785959, "learning_rate": 4.440512986239263e-06, "loss": 0.449, "step": 3365 }, { "epoch": 1.1459121635984864, "grad_norm": 4.4066822856795635, "learning_rate": 4.438801857313274e-06, "loss": 0.4453, "step": 3370 }, { "epoch": 1.1476127715658349, "grad_norm": 3.491649964713033, "learning_rate": 4.437088446490469e-06, "loss": 0.4454, "step": 3375 }, { "epoch": 1.1493133795331831, "grad_norm": 7.232323088989081, "learning_rate": 4.435372755787469e-06, "loss": 0.4534, "step": 3380 }, { "epoch": 1.1510139875005314, "grad_norm": 3.293718858011363, "learning_rate": 4.433654787223576e-06, "loss": 0.4597, "step": 3385 }, { "epoch": 1.1527145954678797, "grad_norm": 7.433792579695618, "learning_rate": 4.431934542820775e-06, "loss": 0.4573, "step": 3390 }, { "epoch": 1.1544152034352282, "grad_norm": 3.82924370466945, "learning_rate": 4.4302120246037295e-06, "loss": 0.4567, "step": 3395 }, { "epoch": 1.1561158114025765, "grad_norm": 7.983481827345274, "learning_rate": 4.428487234599777e-06, "loss": 0.4414, "step": 3400 }, { "epoch": 1.1578164193699247, "grad_norm": 9.436609447791255, "learning_rate": 4.426760174838932e-06, "loss": 0.4262, "step": 3405 }, { "epoch": 1.159517027337273, "grad_norm": 4.743398659803592, "learning_rate": 4.425030847353878e-06, "loss": 0.4155, "step": 3410 }, { "epoch": 1.1612176353046215, "grad_norm": 3.204865750073265, "learning_rate": 4.42329925417997e-06, "loss": 0.4773, "step": 3415 }, { "epoch": 1.1629182432719698, "grad_norm": 12.234446040030164, "learning_rate": 4.421565397355225e-06, "loss": 0.4617, "step": 3420 }, { "epoch": 1.164618851239318, "grad_norm": 4.4116669121739305, "learning_rate": 4.41982927892033e-06, "loss": 0.4391, "step": 3425 }, { "epoch": 1.1663194592066664, "grad_norm": 5.111216592507992, "learning_rate": 4.418090900918629e-06, "loss": 0.4631, "step": 3430 }, { "epoch": 1.1680200671740146, "grad_norm": 7.803946682117352, "learning_rate": 4.416350265396129e-06, "loss": 0.4269, "step": 3435 }, { "epoch": 1.1697206751413631, "grad_norm": 3.828818855349512, "learning_rate": 4.414607374401492e-06, "loss": 0.4256, "step": 3440 }, { "epoch": 1.1714212831087114, "grad_norm": 5.059207154144129, "learning_rate": 4.412862229986034e-06, "loss": 0.4122, "step": 3445 }, { "epoch": 1.1731218910760597, "grad_norm": 5.035231185244624, "learning_rate": 4.411114834203726e-06, "loss": 0.4311, "step": 3450 }, { "epoch": 1.174822499043408, "grad_norm": 4.045539793414604, "learning_rate": 4.409365189111187e-06, "loss": 0.4287, "step": 3455 }, { "epoch": 1.1765231070107562, "grad_norm": 6.292496913133239, "learning_rate": 4.407613296767682e-06, "loss": 0.4475, "step": 3460 }, { "epoch": 1.1782237149781047, "grad_norm": 3.6674441134732283, "learning_rate": 4.405859159235123e-06, "loss": 0.4389, "step": 3465 }, { "epoch": 1.179924322945453, "grad_norm": 5.160559006742301, "learning_rate": 4.404102778578064e-06, "loss": 0.4276, "step": 3470 }, { "epoch": 1.1816249309128013, "grad_norm": 4.055580025128073, "learning_rate": 4.402344156863699e-06, "loss": 0.457, "step": 3475 }, { "epoch": 1.1833255388801496, "grad_norm": 4.361898800931103, "learning_rate": 4.40058329616186e-06, "loss": 0.4286, "step": 3480 }, { "epoch": 1.185026146847498, "grad_norm": 6.495140588757689, "learning_rate": 4.398820198545013e-06, "loss": 0.4453, "step": 3485 }, { "epoch": 1.1867267548148464, "grad_norm": 11.019077172110329, "learning_rate": 4.397054866088258e-06, "loss": 0.442, "step": 3490 }, { "epoch": 1.1884273627821946, "grad_norm": 5.228402398620584, "learning_rate": 4.3952873008693245e-06, "loss": 0.4321, "step": 3495 }, { "epoch": 1.190127970749543, "grad_norm": 4.805648806263351, "learning_rate": 4.39351750496857e-06, "loss": 0.4366, "step": 3500 }, { "epoch": 1.1918285787168914, "grad_norm": 7.796340025598192, "learning_rate": 4.391745480468978e-06, "loss": 0.461, "step": 3505 }, { "epoch": 1.1935291866842397, "grad_norm": 3.063990773463929, "learning_rate": 4.389971229456154e-06, "loss": 0.4458, "step": 3510 }, { "epoch": 1.195229794651588, "grad_norm": 4.429362476378763, "learning_rate": 4.388194754018327e-06, "loss": 0.4428, "step": 3515 }, { "epoch": 1.1969304026189362, "grad_norm": 2.9371227567250004, "learning_rate": 4.38641605624634e-06, "loss": 0.4306, "step": 3520 }, { "epoch": 1.1986310105862845, "grad_norm": 3.8651376005294944, "learning_rate": 4.384635138233653e-06, "loss": 0.4449, "step": 3525 }, { "epoch": 1.200331618553633, "grad_norm": 3.4940040974535727, "learning_rate": 4.38285200207634e-06, "loss": 0.4482, "step": 3530 }, { "epoch": 1.2020322265209813, "grad_norm": 5.782702541050796, "learning_rate": 4.381066649873085e-06, "loss": 0.4207, "step": 3535 }, { "epoch": 1.2037328344883296, "grad_norm": 5.50472142631619, "learning_rate": 4.37927908372518e-06, "loss": 0.4572, "step": 3540 }, { "epoch": 1.2054334424556779, "grad_norm": 7.824789252994781, "learning_rate": 4.3774893057365244e-06, "loss": 0.4318, "step": 3545 }, { "epoch": 1.2071340504230261, "grad_norm": 13.54244874218656, "learning_rate": 4.375697318013618e-06, "loss": 0.4301, "step": 3550 }, { "epoch": 1.2088346583903746, "grad_norm": 4.230530471384282, "learning_rate": 4.373903122665563e-06, "loss": 0.4292, "step": 3555 }, { "epoch": 1.210535266357723, "grad_norm": 6.658320209793368, "learning_rate": 4.372106721804061e-06, "loss": 0.4368, "step": 3560 }, { "epoch": 1.2122358743250712, "grad_norm": 2.857261861954253, "learning_rate": 4.370308117543407e-06, "loss": 0.4522, "step": 3565 }, { "epoch": 1.2139364822924195, "grad_norm": 5.39200827416774, "learning_rate": 4.368507312000491e-06, "loss": 0.4387, "step": 3570 }, { "epoch": 1.215637090259768, "grad_norm": 23.642260016581286, "learning_rate": 4.366704307294794e-06, "loss": 0.4245, "step": 3575 }, { "epoch": 1.2173376982271162, "grad_norm": 4.715256234640616, "learning_rate": 4.364899105548384e-06, "loss": 0.4123, "step": 3580 }, { "epoch": 1.2190383061944645, "grad_norm": 5.092755025756587, "learning_rate": 4.363091708885916e-06, "loss": 0.4303, "step": 3585 }, { "epoch": 1.2207389141618128, "grad_norm": 4.644408008547105, "learning_rate": 4.361282119434626e-06, "loss": 0.4406, "step": 3590 }, { "epoch": 1.2224395221291613, "grad_norm": 3.8748710347038995, "learning_rate": 4.359470339324335e-06, "loss": 0.4194, "step": 3595 }, { "epoch": 1.2241401300965096, "grad_norm": 6.939396520641551, "learning_rate": 4.35765637068744e-06, "loss": 0.4308, "step": 3600 }, { "epoch": 1.2258407380638578, "grad_norm": 5.883415452113619, "learning_rate": 4.355840215658912e-06, "loss": 0.4245, "step": 3605 }, { "epoch": 1.2275413460312061, "grad_norm": 4.372515269841919, "learning_rate": 4.354021876376297e-06, "loss": 0.466, "step": 3610 }, { "epoch": 1.2292419539985544, "grad_norm": 3.38508833964315, "learning_rate": 4.352201354979715e-06, "loss": 0.4437, "step": 3615 }, { "epoch": 1.230942561965903, "grad_norm": 4.334673736588709, "learning_rate": 4.350378653611848e-06, "loss": 0.4338, "step": 3620 }, { "epoch": 1.2326431699332512, "grad_norm": 4.362597499023024, "learning_rate": 4.348553774417948e-06, "loss": 0.4537, "step": 3625 }, { "epoch": 1.2343437779005995, "grad_norm": 10.152475333998257, "learning_rate": 4.346726719545828e-06, "loss": 0.437, "step": 3630 }, { "epoch": 1.2360443858679477, "grad_norm": 6.917711265677128, "learning_rate": 4.344897491145866e-06, "loss": 0.4567, "step": 3635 }, { "epoch": 1.237744993835296, "grad_norm": 3.0024625558233393, "learning_rate": 4.343066091370992e-06, "loss": 0.4586, "step": 3640 }, { "epoch": 1.2394456018026445, "grad_norm": 6.066921141014438, "learning_rate": 4.341232522376696e-06, "loss": 0.4439, "step": 3645 }, { "epoch": 1.2411462097699928, "grad_norm": 10.159516057402293, "learning_rate": 4.339396786321018e-06, "loss": 0.4262, "step": 3650 }, { "epoch": 1.242846817737341, "grad_norm": 6.1930370198982105, "learning_rate": 4.337558885364552e-06, "loss": 0.4147, "step": 3655 }, { "epoch": 1.2445474257046893, "grad_norm": 4.609436343311945, "learning_rate": 4.335718821670439e-06, "loss": 0.4256, "step": 3660 }, { "epoch": 1.2462480336720378, "grad_norm": 6.704588153460611, "learning_rate": 4.333876597404362e-06, "loss": 0.4298, "step": 3665 }, { "epoch": 1.2479486416393861, "grad_norm": 6.537990452906174, "learning_rate": 4.332032214734552e-06, "loss": 0.4309, "step": 3670 }, { "epoch": 1.2496492496067344, "grad_norm": 6.06218752888017, "learning_rate": 4.3301856758317765e-06, "loss": 0.4543, "step": 3675 }, { "epoch": 1.2513498575740827, "grad_norm": 4.072305293977542, "learning_rate": 4.328336982869343e-06, "loss": 0.4389, "step": 3680 }, { "epoch": 1.2530504655414312, "grad_norm": 5.420253035287267, "learning_rate": 4.326486138023094e-06, "loss": 0.4543, "step": 3685 }, { "epoch": 1.2547510735087795, "grad_norm": 4.952172100936549, "learning_rate": 4.324633143471402e-06, "loss": 0.4219, "step": 3690 }, { "epoch": 1.2564516814761277, "grad_norm": 4.895280001327636, "learning_rate": 4.322778001395174e-06, "loss": 0.4624, "step": 3695 }, { "epoch": 1.258152289443476, "grad_norm": 5.021111606296189, "learning_rate": 4.320920713977843e-06, "loss": 0.4493, "step": 3700 }, { "epoch": 1.2598528974108243, "grad_norm": 8.792884307722462, "learning_rate": 4.319061283405365e-06, "loss": 0.4507, "step": 3705 }, { "epoch": 1.2615535053781728, "grad_norm": 8.462433497526717, "learning_rate": 4.317199711866219e-06, "loss": 0.4387, "step": 3710 }, { "epoch": 1.263254113345521, "grad_norm": 11.089756877549231, "learning_rate": 4.315336001551407e-06, "loss": 0.4351, "step": 3715 }, { "epoch": 1.2649547213128693, "grad_norm": 4.625412332527817, "learning_rate": 4.313470154654443e-06, "loss": 0.4348, "step": 3720 }, { "epoch": 1.2666553292802176, "grad_norm": 5.432964358134367, "learning_rate": 4.311602173371362e-06, "loss": 0.4486, "step": 3725 }, { "epoch": 1.268355937247566, "grad_norm": 4.206142665765904, "learning_rate": 4.309732059900705e-06, "loss": 0.448, "step": 3730 }, { "epoch": 1.2700565452149144, "grad_norm": 6.159446545724075, "learning_rate": 4.307859816443526e-06, "loss": 0.4086, "step": 3735 }, { "epoch": 1.2717571531822627, "grad_norm": 7.831797795407591, "learning_rate": 4.305985445203385e-06, "loss": 0.4538, "step": 3740 }, { "epoch": 1.273457761149611, "grad_norm": 8.60007583844762, "learning_rate": 4.304108948386346e-06, "loss": 0.4534, "step": 3745 }, { "epoch": 1.2751583691169592, "grad_norm": 16.34293430521431, "learning_rate": 4.3022303282009755e-06, "loss": 0.4493, "step": 3750 }, { "epoch": 1.2768589770843075, "grad_norm": 5.278467269983003, "learning_rate": 4.30034958685834e-06, "loss": 0.4491, "step": 3755 }, { "epoch": 1.278559585051656, "grad_norm": 9.360024586550447, "learning_rate": 4.298466726571999e-06, "loss": 0.4212, "step": 3760 }, { "epoch": 1.2802601930190043, "grad_norm": 3.7113866114527947, "learning_rate": 4.296581749558011e-06, "loss": 0.4638, "step": 3765 }, { "epoch": 1.2819608009863526, "grad_norm": 5.854958952627499, "learning_rate": 4.29469465803492e-06, "loss": 0.4462, "step": 3770 }, { "epoch": 1.283661408953701, "grad_norm": 3.6818168663588784, "learning_rate": 4.292805454223763e-06, "loss": 0.4291, "step": 3775 }, { "epoch": 1.2853620169210493, "grad_norm": 3.243734301495061, "learning_rate": 4.290914140348063e-06, "loss": 0.4256, "step": 3780 }, { "epoch": 1.2870626248883976, "grad_norm": 3.2162443775372447, "learning_rate": 4.289020718633822e-06, "loss": 0.4217, "step": 3785 }, { "epoch": 1.288763232855746, "grad_norm": 5.687066509718208, "learning_rate": 4.28712519130953e-06, "loss": 0.4271, "step": 3790 }, { "epoch": 1.2904638408230942, "grad_norm": 24.368889431915512, "learning_rate": 4.285227560606149e-06, "loss": 0.4308, "step": 3795 }, { "epoch": 1.2921644487904427, "grad_norm": 4.154763886002037, "learning_rate": 4.2833278287571186e-06, "loss": 0.4554, "step": 3800 }, { "epoch": 1.293865056757791, "grad_norm": 6.2104169824121955, "learning_rate": 4.281425997998353e-06, "loss": 0.4408, "step": 3805 }, { "epoch": 1.2955656647251392, "grad_norm": 4.695856290250523, "learning_rate": 4.279522070568235e-06, "loss": 0.457, "step": 3810 }, { "epoch": 1.2972662726924875, "grad_norm": 5.810017736651081, "learning_rate": 4.277616048707615e-06, "loss": 0.4248, "step": 3815 }, { "epoch": 1.2989668806598358, "grad_norm": 6.107344653532907, "learning_rate": 4.2757079346598105e-06, "loss": 0.4452, "step": 3820 }, { "epoch": 1.3006674886271843, "grad_norm": 3.4942653230821863, "learning_rate": 4.273797730670598e-06, "loss": 0.4217, "step": 3825 }, { "epoch": 1.3023680965945326, "grad_norm": 18.112883039304148, "learning_rate": 4.271885438988217e-06, "loss": 0.437, "step": 3830 }, { "epoch": 1.3040687045618808, "grad_norm": 19.389834849422982, "learning_rate": 4.269971061863362e-06, "loss": 0.419, "step": 3835 }, { "epoch": 1.3057693125292291, "grad_norm": 5.474955920511076, "learning_rate": 4.268054601549183e-06, "loss": 0.4595, "step": 3840 }, { "epoch": 1.3074699204965774, "grad_norm": 2.9582870715276792, "learning_rate": 4.2661360603012825e-06, "loss": 0.4621, "step": 3845 }, { "epoch": 1.309170528463926, "grad_norm": 16.34027412673593, "learning_rate": 4.2642154403777105e-06, "loss": 0.4233, "step": 3850 }, { "epoch": 1.3108711364312742, "grad_norm": 5.918808175469473, "learning_rate": 4.262292744038964e-06, "loss": 0.4406, "step": 3855 }, { "epoch": 1.3125717443986225, "grad_norm": 3.399498455379893, "learning_rate": 4.260367973547985e-06, "loss": 0.4299, "step": 3860 }, { "epoch": 1.314272352365971, "grad_norm": 13.542704000607294, "learning_rate": 4.258441131170157e-06, "loss": 0.4132, "step": 3865 }, { "epoch": 1.3159729603333192, "grad_norm": 9.02967740830736, "learning_rate": 4.256512219173298e-06, "loss": 0.4239, "step": 3870 }, { "epoch": 1.3176735683006675, "grad_norm": 3.9130220430192524, "learning_rate": 4.254581239827667e-06, "loss": 0.4342, "step": 3875 }, { "epoch": 1.3193741762680158, "grad_norm": 5.574299002291679, "learning_rate": 4.252648195405954e-06, "loss": 0.4568, "step": 3880 }, { "epoch": 1.321074784235364, "grad_norm": 6.3324159546947465, "learning_rate": 4.250713088183278e-06, "loss": 0.4419, "step": 3885 }, { "epoch": 1.3227753922027126, "grad_norm": 14.164208290467345, "learning_rate": 4.248775920437191e-06, "loss": 0.4401, "step": 3890 }, { "epoch": 1.3244760001700608, "grad_norm": 6.1888271893560916, "learning_rate": 4.246836694447661e-06, "loss": 0.4475, "step": 3895 }, { "epoch": 1.3261766081374091, "grad_norm": 2.957848596009968, "learning_rate": 4.244895412497088e-06, "loss": 0.4035, "step": 3900 }, { "epoch": 1.3278772161047574, "grad_norm": 4.295157380784761, "learning_rate": 4.242952076870287e-06, "loss": 0.4524, "step": 3905 }, { "epoch": 1.3295778240721057, "grad_norm": 9.30317062363427, "learning_rate": 4.241006689854491e-06, "loss": 0.4306, "step": 3910 }, { "epoch": 1.3312784320394542, "grad_norm": 4.482380339897904, "learning_rate": 4.239059253739346e-06, "loss": 0.4121, "step": 3915 }, { "epoch": 1.3329790400068025, "grad_norm": 4.424068078801065, "learning_rate": 4.237109770816913e-06, "loss": 0.4164, "step": 3920 }, { "epoch": 1.3346796479741507, "grad_norm": 4.998850928883728, "learning_rate": 4.235158243381658e-06, "loss": 0.4392, "step": 3925 }, { "epoch": 1.336380255941499, "grad_norm": 6.3018273454234635, "learning_rate": 4.233204673730456e-06, "loss": 0.4501, "step": 3930 }, { "epoch": 1.3380808639088473, "grad_norm": 8.502288918753674, "learning_rate": 4.231249064162586e-06, "loss": 0.4359, "step": 3935 }, { "epoch": 1.3397814718761958, "grad_norm": 4.793062000875688, "learning_rate": 4.229291416979726e-06, "loss": 0.455, "step": 3940 }, { "epoch": 1.341482079843544, "grad_norm": 9.180274029644583, "learning_rate": 4.227331734485953e-06, "loss": 0.4115, "step": 3945 }, { "epoch": 1.3431826878108923, "grad_norm": 3.119289594195432, "learning_rate": 4.225370018987741e-06, "loss": 0.4362, "step": 3950 }, { "epoch": 1.3448832957782408, "grad_norm": 7.274300467490672, "learning_rate": 4.223406272793953e-06, "loss": 0.4284, "step": 3955 }, { "epoch": 1.3465839037455891, "grad_norm": 4.6503861534660675, "learning_rate": 4.221440498215845e-06, "loss": 0.426, "step": 3960 }, { "epoch": 1.3482845117129374, "grad_norm": 5.703563707259275, "learning_rate": 4.21947269756706e-06, "loss": 0.4249, "step": 3965 }, { "epoch": 1.3499851196802857, "grad_norm": 4.120422582463924, "learning_rate": 4.217502873163626e-06, "loss": 0.4319, "step": 3970 }, { "epoch": 1.351685727647634, "grad_norm": 7.764067180947526, "learning_rate": 4.215531027323952e-06, "loss": 0.4291, "step": 3975 }, { "epoch": 1.3533863356149824, "grad_norm": 3.2581402948638423, "learning_rate": 4.2135571623688244e-06, "loss": 0.4511, "step": 3980 }, { "epoch": 1.3550869435823307, "grad_norm": 4.221269122403145, "learning_rate": 4.211581280621411e-06, "loss": 0.4295, "step": 3985 }, { "epoch": 1.356787551549679, "grad_norm": 5.052878261691366, "learning_rate": 4.209603384407248e-06, "loss": 0.4488, "step": 3990 }, { "epoch": 1.3584881595170273, "grad_norm": 9.378569205330985, "learning_rate": 4.207623476054246e-06, "loss": 0.4381, "step": 3995 }, { "epoch": 1.3601887674843756, "grad_norm": 6.522064207623179, "learning_rate": 4.205641557892682e-06, "loss": 0.4298, "step": 4000 }, { "epoch": 1.361889375451724, "grad_norm": 5.294462301922804, "learning_rate": 4.203657632255199e-06, "loss": 0.421, "step": 4005 }, { "epoch": 1.3635899834190723, "grad_norm": 4.134239000036112, "learning_rate": 4.201671701476803e-06, "loss": 0.4444, "step": 4010 }, { "epoch": 1.3652905913864206, "grad_norm": 6.035380347976283, "learning_rate": 4.19968376789486e-06, "loss": 0.4213, "step": 4015 }, { "epoch": 1.366991199353769, "grad_norm": 4.7922651854375315, "learning_rate": 4.1976938338490925e-06, "loss": 0.4403, "step": 4020 }, { "epoch": 1.3686918073211172, "grad_norm": 6.631140177235056, "learning_rate": 4.195701901681579e-06, "loss": 0.4352, "step": 4025 }, { "epoch": 1.3703924152884657, "grad_norm": 2.7344288539898334, "learning_rate": 4.193707973736747e-06, "loss": 0.4093, "step": 4030 }, { "epoch": 1.372093023255814, "grad_norm": 4.278791460437773, "learning_rate": 4.1917120523613766e-06, "loss": 0.4422, "step": 4035 }, { "epoch": 1.3737936312231622, "grad_norm": 5.214406477643654, "learning_rate": 4.18971413990459e-06, "loss": 0.4579, "step": 4040 }, { "epoch": 1.3754942391905107, "grad_norm": 6.063078786132071, "learning_rate": 4.187714238717857e-06, "loss": 0.4394, "step": 4045 }, { "epoch": 1.377194847157859, "grad_norm": 63.69948112970655, "learning_rate": 4.185712351154985e-06, "loss": 0.439, "step": 4050 }, { "epoch": 1.3788954551252073, "grad_norm": 3.2261921517248573, "learning_rate": 4.18370847957212e-06, "loss": 0.4198, "step": 4055 }, { "epoch": 1.3805960630925556, "grad_norm": 3.634713290168712, "learning_rate": 4.181702626327745e-06, "loss": 0.461, "step": 4060 }, { "epoch": 1.3822966710599038, "grad_norm": 3.277595544835171, "learning_rate": 4.179694793782673e-06, "loss": 0.4596, "step": 4065 }, { "epoch": 1.3839972790272523, "grad_norm": 5.408346655373303, "learning_rate": 4.177684984300046e-06, "loss": 0.418, "step": 4070 }, { "epoch": 1.3856978869946006, "grad_norm": 4.2892952787221015, "learning_rate": 4.1756732002453345e-06, "loss": 0.4087, "step": 4075 }, { "epoch": 1.387398494961949, "grad_norm": 3.3627724696710706, "learning_rate": 4.173659443986334e-06, "loss": 0.4316, "step": 4080 }, { "epoch": 1.3890991029292972, "grad_norm": 3.651825188574973, "learning_rate": 4.17164371789316e-06, "loss": 0.4455, "step": 4085 }, { "epoch": 1.3907997108966454, "grad_norm": 3.366971683134897, "learning_rate": 4.169626024338245e-06, "loss": 0.4476, "step": 4090 }, { "epoch": 1.392500318863994, "grad_norm": 4.027885573886271, "learning_rate": 4.167606365696337e-06, "loss": 0.4213, "step": 4095 }, { "epoch": 1.3942009268313422, "grad_norm": 84.54064644279089, "learning_rate": 4.165584744344502e-06, "loss": 0.4188, "step": 4100 }, { "epoch": 1.3959015347986905, "grad_norm": 3.550654174206856, "learning_rate": 4.163561162662109e-06, "loss": 0.4367, "step": 4105 }, { "epoch": 1.3976021427660388, "grad_norm": 6.429938907038778, "learning_rate": 4.161535623030839e-06, "loss": 0.4347, "step": 4110 }, { "epoch": 1.399302750733387, "grad_norm": 3.3469080043284527, "learning_rate": 4.159508127834676e-06, "loss": 0.429, "step": 4115 }, { "epoch": 1.4010033587007356, "grad_norm": 3.932907324437497, "learning_rate": 4.157478679459904e-06, "loss": 0.45, "step": 4120 }, { "epoch": 1.4027039666680838, "grad_norm": 3.0465101479498093, "learning_rate": 4.155447280295109e-06, "loss": 0.4143, "step": 4125 }, { "epoch": 1.4044045746354321, "grad_norm": 6.2332284033665015, "learning_rate": 4.153413932731172e-06, "loss": 0.4155, "step": 4130 }, { "epoch": 1.4061051826027806, "grad_norm": 4.594711738829712, "learning_rate": 4.151378639161263e-06, "loss": 0.4174, "step": 4135 }, { "epoch": 1.407805790570129, "grad_norm": 8.369037345643367, "learning_rate": 4.14934140198085e-06, "loss": 0.4428, "step": 4140 }, { "epoch": 1.4095063985374772, "grad_norm": 4.450892512279648, "learning_rate": 4.147302223587683e-06, "loss": 0.4323, "step": 4145 }, { "epoch": 1.4112070065048254, "grad_norm": 3.5572501388195965, "learning_rate": 4.145261106381797e-06, "loss": 0.4364, "step": 4150 }, { "epoch": 1.4129076144721737, "grad_norm": 3.5926379046106303, "learning_rate": 4.1432180527655105e-06, "loss": 0.4504, "step": 4155 }, { "epoch": 1.4146082224395222, "grad_norm": 3.7409778582649875, "learning_rate": 4.1411730651434224e-06, "loss": 0.4366, "step": 4160 }, { "epoch": 1.4163088304068705, "grad_norm": 3.3685418588380545, "learning_rate": 4.1391261459224055e-06, "loss": 0.4467, "step": 4165 }, { "epoch": 1.4180094383742188, "grad_norm": 6.697596996121014, "learning_rate": 4.137077297511606e-06, "loss": 0.434, "step": 4170 }, { "epoch": 1.419710046341567, "grad_norm": 4.224825510306204, "learning_rate": 4.135026522322441e-06, "loss": 0.4157, "step": 4175 }, { "epoch": 1.4214106543089153, "grad_norm": 4.358559841751216, "learning_rate": 4.132973822768597e-06, "loss": 0.4242, "step": 4180 }, { "epoch": 1.4231112622762638, "grad_norm": 4.658618977150568, "learning_rate": 4.130919201266023e-06, "loss": 0.4228, "step": 4185 }, { "epoch": 1.424811870243612, "grad_norm": 4.397079121273564, "learning_rate": 4.1288626602329316e-06, "loss": 0.4458, "step": 4190 }, { "epoch": 1.4265124782109604, "grad_norm": 8.910498551520558, "learning_rate": 4.126804202089795e-06, "loss": 0.4336, "step": 4195 }, { "epoch": 1.4282130861783087, "grad_norm": 5.317988762383234, "learning_rate": 4.12474382925934e-06, "loss": 0.4274, "step": 4200 }, { "epoch": 1.429913694145657, "grad_norm": 6.420028086835303, "learning_rate": 4.122681544166548e-06, "loss": 0.4249, "step": 4205 }, { "epoch": 1.4316143021130054, "grad_norm": 3.3486351335128677, "learning_rate": 4.120617349238651e-06, "loss": 0.4311, "step": 4210 }, { "epoch": 1.4333149100803537, "grad_norm": 3.765520622561078, "learning_rate": 4.118551246905128e-06, "loss": 0.404, "step": 4215 }, { "epoch": 1.435015518047702, "grad_norm": 4.361629163635694, "learning_rate": 4.116483239597706e-06, "loss": 0.4358, "step": 4220 }, { "epoch": 1.4367161260150505, "grad_norm": 8.161954348572857, "learning_rate": 4.1144133297503495e-06, "loss": 0.4396, "step": 4225 }, { "epoch": 1.4384167339823988, "grad_norm": 5.343639505025115, "learning_rate": 4.1123415197992645e-06, "loss": 0.4488, "step": 4230 }, { "epoch": 1.440117341949747, "grad_norm": 3.896952357426291, "learning_rate": 4.1102678121828956e-06, "loss": 0.4164, "step": 4235 }, { "epoch": 1.4418179499170953, "grad_norm": 3.778144495904373, "learning_rate": 4.108192209341916e-06, "loss": 0.429, "step": 4240 }, { "epoch": 1.4435185578844436, "grad_norm": 4.379626057470475, "learning_rate": 4.1061147137192325e-06, "loss": 0.4273, "step": 4245 }, { "epoch": 1.445219165851792, "grad_norm": 3.3970834318244982, "learning_rate": 4.104035327759981e-06, "loss": 0.4324, "step": 4250 }, { "epoch": 1.4469197738191404, "grad_norm": 6.877015746846206, "learning_rate": 4.101954053911519e-06, "loss": 0.427, "step": 4255 }, { "epoch": 1.4486203817864887, "grad_norm": 8.771197237906213, "learning_rate": 4.099870894623429e-06, "loss": 0.427, "step": 4260 }, { "epoch": 1.450320989753837, "grad_norm": 4.106539562178847, "learning_rate": 4.097785852347509e-06, "loss": 0.4327, "step": 4265 }, { "epoch": 1.4520215977211852, "grad_norm": 3.0368575048625503, "learning_rate": 4.095698929537776e-06, "loss": 0.4247, "step": 4270 }, { "epoch": 1.4537222056885337, "grad_norm": 4.406684997834663, "learning_rate": 4.093610128650461e-06, "loss": 0.4328, "step": 4275 }, { "epoch": 1.455422813655882, "grad_norm": 10.807724313086286, "learning_rate": 4.091519452144002e-06, "loss": 0.4093, "step": 4280 }, { "epoch": 1.4571234216232303, "grad_norm": 4.2489620858096995, "learning_rate": 4.089426902479048e-06, "loss": 0.4166, "step": 4285 }, { "epoch": 1.4588240295905786, "grad_norm": 15.82562295022666, "learning_rate": 4.0873324821184505e-06, "loss": 0.4137, "step": 4290 }, { "epoch": 1.4605246375579268, "grad_norm": 20.912345821951686, "learning_rate": 4.085236193527264e-06, "loss": 0.4524, "step": 4295 }, { "epoch": 1.4622252455252753, "grad_norm": 10.918557580898346, "learning_rate": 4.08313803917274e-06, "loss": 0.4268, "step": 4300 }, { "epoch": 1.4639258534926236, "grad_norm": 3.4822802477088337, "learning_rate": 4.08103802152433e-06, "loss": 0.4283, "step": 4305 }, { "epoch": 1.4656264614599719, "grad_norm": 4.537596013386029, "learning_rate": 4.078936143053673e-06, "loss": 0.4502, "step": 4310 }, { "epoch": 1.4673270694273204, "grad_norm": 4.038796537691099, "learning_rate": 4.076832406234601e-06, "loss": 0.4429, "step": 4315 }, { "epoch": 1.4690276773946687, "grad_norm": 5.579890053055556, "learning_rate": 4.074726813543134e-06, "loss": 0.424, "step": 4320 }, { "epoch": 1.470728285362017, "grad_norm": 6.688734469203853, "learning_rate": 4.072619367457475e-06, "loss": 0.4596, "step": 4325 }, { "epoch": 1.4724288933293652, "grad_norm": 3.5175009361444314, "learning_rate": 4.070510070458009e-06, "loss": 0.4227, "step": 4330 }, { "epoch": 1.4741295012967135, "grad_norm": 10.407473182628523, "learning_rate": 4.068398925027299e-06, "loss": 0.4277, "step": 4335 }, { "epoch": 1.475830109264062, "grad_norm": 4.001806688070732, "learning_rate": 4.0662859336500834e-06, "loss": 0.4382, "step": 4340 }, { "epoch": 1.4775307172314103, "grad_norm": 9.423854658257099, "learning_rate": 4.064171098813274e-06, "loss": 0.4325, "step": 4345 }, { "epoch": 1.4792313251987586, "grad_norm": 2.8345757589574205, "learning_rate": 4.062054423005952e-06, "loss": 0.4203, "step": 4350 }, { "epoch": 1.4809319331661068, "grad_norm": 6.113162495170087, "learning_rate": 4.059935908719366e-06, "loss": 0.4194, "step": 4355 }, { "epoch": 1.482632541133455, "grad_norm": 3.5573072498137313, "learning_rate": 4.0578155584469245e-06, "loss": 0.4406, "step": 4360 }, { "epoch": 1.4843331491008036, "grad_norm": 6.944016100572673, "learning_rate": 4.055693374684203e-06, "loss": 0.4426, "step": 4365 }, { "epoch": 1.4860337570681519, "grad_norm": 3.0991563801030995, "learning_rate": 4.05356935992893e-06, "loss": 0.4286, "step": 4370 }, { "epoch": 1.4877343650355002, "grad_norm": 3.876016849273567, "learning_rate": 4.051443516680991e-06, "loss": 0.4059, "step": 4375 }, { "epoch": 1.4894349730028484, "grad_norm": 3.1618825548418514, "learning_rate": 4.049315847442426e-06, "loss": 0.4508, "step": 4380 }, { "epoch": 1.4911355809701967, "grad_norm": 3.26722333104821, "learning_rate": 4.047186354717419e-06, "loss": 0.4377, "step": 4385 }, { "epoch": 1.4928361889375452, "grad_norm": 4.949642233534566, "learning_rate": 4.045055041012304e-06, "loss": 0.4172, "step": 4390 }, { "epoch": 1.4945367969048935, "grad_norm": 3.9737091600809586, "learning_rate": 4.042921908835557e-06, "loss": 0.425, "step": 4395 }, { "epoch": 1.4962374048722418, "grad_norm": 5.237403020237026, "learning_rate": 4.040786960697793e-06, "loss": 0.4168, "step": 4400 }, { "epoch": 1.4979380128395903, "grad_norm": 3.30834199256293, "learning_rate": 4.038650199111766e-06, "loss": 0.4019, "step": 4405 }, { "epoch": 1.4996386208069385, "grad_norm": 9.193316798315635, "learning_rate": 4.036511626592366e-06, "loss": 0.4412, "step": 4410 }, { "epoch": 1.5013392287742868, "grad_norm": 3.367832782080482, "learning_rate": 4.034371245656611e-06, "loss": 0.4338, "step": 4415 }, { "epoch": 1.503039836741635, "grad_norm": 6.827626401756652, "learning_rate": 4.0322290588236475e-06, "loss": 0.4103, "step": 4420 }, { "epoch": 1.5047404447089834, "grad_norm": 9.80059877372828, "learning_rate": 4.03008506861475e-06, "loss": 0.4311, "step": 4425 }, { "epoch": 1.5064410526763319, "grad_norm": 4.0833672787431095, "learning_rate": 4.027939277553314e-06, "loss": 0.422, "step": 4430 }, { "epoch": 1.5081416606436802, "grad_norm": 4.352980395173143, "learning_rate": 4.025791688164856e-06, "loss": 0.4165, "step": 4435 }, { "epoch": 1.5098422686110284, "grad_norm": 3.6443666952119393, "learning_rate": 4.023642302977007e-06, "loss": 0.418, "step": 4440 }, { "epoch": 1.5115428765783767, "grad_norm": 2.5535581373220566, "learning_rate": 4.021491124519512e-06, "loss": 0.4128, "step": 4445 }, { "epoch": 1.513243484545725, "grad_norm": 2.5635629960000257, "learning_rate": 4.0193381553242275e-06, "loss": 0.4265, "step": 4450 }, { "epoch": 1.5149440925130735, "grad_norm": 9.71155883976001, "learning_rate": 4.017183397925116e-06, "loss": 0.4059, "step": 4455 }, { "epoch": 1.5166447004804218, "grad_norm": 3.963949214840473, "learning_rate": 4.015026854858248e-06, "loss": 0.4315, "step": 4460 }, { "epoch": 1.51834530844777, "grad_norm": 10.098506319170221, "learning_rate": 4.012868528661793e-06, "loss": 0.4197, "step": 4465 }, { "epoch": 1.5200459164151185, "grad_norm": 7.311071257965433, "learning_rate": 4.01070842187602e-06, "loss": 0.3961, "step": 4470 }, { "epoch": 1.5217465243824666, "grad_norm": 2.5919745636282623, "learning_rate": 4.0085465370432906e-06, "loss": 0.4339, "step": 4475 }, { "epoch": 1.523447132349815, "grad_norm": 9.414821359866528, "learning_rate": 4.006382876708066e-06, "loss": 0.4381, "step": 4480 }, { "epoch": 1.5251477403171634, "grad_norm": 3.2069921113732196, "learning_rate": 4.004217443416889e-06, "loss": 0.4221, "step": 4485 }, { "epoch": 1.5268483482845117, "grad_norm": 4.108172954869957, "learning_rate": 4.0020502397183955e-06, "loss": 0.4404, "step": 4490 }, { "epoch": 1.5285489562518602, "grad_norm": 4.7412987043119506, "learning_rate": 3.9998812681633036e-06, "loss": 0.4076, "step": 4495 }, { "epoch": 1.5302495642192082, "grad_norm": 5.177073536747887, "learning_rate": 3.9977105313044084e-06, "loss": 0.4039, "step": 4500 }, { "epoch": 1.5319501721865567, "grad_norm": 3.6093301676901537, "learning_rate": 3.995538031696588e-06, "loss": 0.4138, "step": 4505 }, { "epoch": 1.533650780153905, "grad_norm": 5.26144190406865, "learning_rate": 3.99336377189679e-06, "loss": 0.4079, "step": 4510 }, { "epoch": 1.5353513881212533, "grad_norm": 4.173292833080453, "learning_rate": 3.991187754464039e-06, "loss": 0.437, "step": 4515 }, { "epoch": 1.5370519960886018, "grad_norm": 4.696152943235458, "learning_rate": 3.989009981959424e-06, "loss": 0.4103, "step": 4520 }, { "epoch": 1.53875260405595, "grad_norm": 4.3396528642715335, "learning_rate": 3.986830456946102e-06, "loss": 0.3966, "step": 4525 }, { "epoch": 1.5404532120232983, "grad_norm": 9.149805076911427, "learning_rate": 3.984649181989292e-06, "loss": 0.4195, "step": 4530 }, { "epoch": 1.5421538199906466, "grad_norm": 6.507804562542518, "learning_rate": 3.982466159656271e-06, "loss": 0.4295, "step": 4535 }, { "epoch": 1.5438544279579949, "grad_norm": 3.6654863638989053, "learning_rate": 3.980281392516376e-06, "loss": 0.4303, "step": 4540 }, { "epoch": 1.5455550359253434, "grad_norm": 4.5023392766309325, "learning_rate": 3.978094883140996e-06, "loss": 0.42, "step": 4545 }, { "epoch": 1.5472556438926917, "grad_norm": 5.735810888404813, "learning_rate": 3.975906634103569e-06, "loss": 0.4228, "step": 4550 }, { "epoch": 1.54895625186004, "grad_norm": 4.02035436801197, "learning_rate": 3.973716647979581e-06, "loss": 0.4124, "step": 4555 }, { "epoch": 1.5506568598273884, "grad_norm": 4.557603410060842, "learning_rate": 3.971524927346565e-06, "loss": 0.4281, "step": 4560 }, { "epoch": 1.5523574677947365, "grad_norm": 4.962119472804869, "learning_rate": 3.969331474784092e-06, "loss": 0.4406, "step": 4565 }, { "epoch": 1.554058075762085, "grad_norm": 3.2416613132367713, "learning_rate": 3.967136292873776e-06, "loss": 0.4329, "step": 4570 }, { "epoch": 1.5557586837294333, "grad_norm": 7.2371279744266115, "learning_rate": 3.96493938419926e-06, "loss": 0.4146, "step": 4575 }, { "epoch": 1.5574592916967815, "grad_norm": 9.496380193325114, "learning_rate": 3.962740751346224e-06, "loss": 0.4391, "step": 4580 }, { "epoch": 1.55915989966413, "grad_norm": 3.9368690731569314, "learning_rate": 3.960540396902378e-06, "loss": 0.4201, "step": 4585 }, { "epoch": 1.560860507631478, "grad_norm": 11.53126552855203, "learning_rate": 3.958338323457455e-06, "loss": 0.4242, "step": 4590 }, { "epoch": 1.5625611155988266, "grad_norm": 3.5540690187464805, "learning_rate": 3.956134533603211e-06, "loss": 0.4191, "step": 4595 }, { "epoch": 1.5642617235661749, "grad_norm": 4.546387352827355, "learning_rate": 3.953929029933427e-06, "loss": 0.4361, "step": 4600 }, { "epoch": 1.5659623315335232, "grad_norm": 4.516207050932684, "learning_rate": 3.951721815043895e-06, "loss": 0.4319, "step": 4605 }, { "epoch": 1.5676629395008717, "grad_norm": 4.965601982415111, "learning_rate": 3.949512891532424e-06, "loss": 0.3922, "step": 4610 }, { "epoch": 1.56936354746822, "grad_norm": 4.085020023753369, "learning_rate": 3.9473022619988364e-06, "loss": 0.4238, "step": 4615 }, { "epoch": 1.5710641554355682, "grad_norm": 6.507811927998727, "learning_rate": 3.945089929044957e-06, "loss": 0.4134, "step": 4620 }, { "epoch": 1.5727647634029165, "grad_norm": 3.126306855943486, "learning_rate": 3.94287589527462e-06, "loss": 0.4193, "step": 4625 }, { "epoch": 1.5744653713702648, "grad_norm": 5.4206195687295295, "learning_rate": 3.940660163293659e-06, "loss": 0.4537, "step": 4630 }, { "epoch": 1.5761659793376133, "grad_norm": 4.308804498979996, "learning_rate": 3.9384427357099084e-06, "loss": 0.4532, "step": 4635 }, { "epoch": 1.5778665873049615, "grad_norm": 3.5101886165442937, "learning_rate": 3.936223615133195e-06, "loss": 0.3952, "step": 4640 }, { "epoch": 1.5795671952723098, "grad_norm": 5.5337619345336915, "learning_rate": 3.934002804175343e-06, "loss": 0.4122, "step": 4645 }, { "epoch": 1.5812678032396583, "grad_norm": 3.6597291462675376, "learning_rate": 3.931780305450161e-06, "loss": 0.4046, "step": 4650 }, { "epoch": 1.5829684112070064, "grad_norm": 22.15305892455393, "learning_rate": 3.929556121573447e-06, "loss": 0.4326, "step": 4655 }, { "epoch": 1.5846690191743549, "grad_norm": 3.649143816745209, "learning_rate": 3.9273302551629825e-06, "loss": 0.4141, "step": 4660 }, { "epoch": 1.5863696271417032, "grad_norm": 6.823266912741127, "learning_rate": 3.925102708838527e-06, "loss": 0.4178, "step": 4665 }, { "epoch": 1.5880702351090514, "grad_norm": 5.531710013288801, "learning_rate": 3.92287348522182e-06, "loss": 0.4406, "step": 4670 }, { "epoch": 1.5897708430764, "grad_norm": 8.859640722590902, "learning_rate": 3.920642586936573e-06, "loss": 0.4097, "step": 4675 }, { "epoch": 1.591471451043748, "grad_norm": 3.0945720240734005, "learning_rate": 3.918410016608469e-06, "loss": 0.4484, "step": 4680 }, { "epoch": 1.5931720590110965, "grad_norm": 5.242209952001623, "learning_rate": 3.916175776865161e-06, "loss": 0.4035, "step": 4685 }, { "epoch": 1.5948726669784448, "grad_norm": 4.970760265373998, "learning_rate": 3.9139398703362635e-06, "loss": 0.4475, "step": 4690 }, { "epoch": 1.596573274945793, "grad_norm": 4.629251668056431, "learning_rate": 3.911702299653355e-06, "loss": 0.4346, "step": 4695 }, { "epoch": 1.5982738829131415, "grad_norm": 6.736287650537567, "learning_rate": 3.909463067449971e-06, "loss": 0.4276, "step": 4700 }, { "epoch": 1.5999744908804898, "grad_norm": 5.197489467524276, "learning_rate": 3.907222176361605e-06, "loss": 0.4235, "step": 4705 }, { "epoch": 1.601675098847838, "grad_norm": 6.135582408058318, "learning_rate": 3.9049796290257e-06, "loss": 0.4121, "step": 4710 }, { "epoch": 1.6033757068151864, "grad_norm": 6.332586568891612, "learning_rate": 3.902735428081651e-06, "loss": 0.3969, "step": 4715 }, { "epoch": 1.6050763147825347, "grad_norm": 4.634540377565531, "learning_rate": 3.900489576170798e-06, "loss": 0.4181, "step": 4720 }, { "epoch": 1.6067769227498832, "grad_norm": 2.8978019838432, "learning_rate": 3.898242075936423e-06, "loss": 0.4428, "step": 4725 }, { "epoch": 1.6084775307172314, "grad_norm": 14.566279377899951, "learning_rate": 3.895992930023751e-06, "loss": 0.4039, "step": 4730 }, { "epoch": 1.6101781386845797, "grad_norm": 3.483953536288087, "learning_rate": 3.89374214107994e-06, "loss": 0.4383, "step": 4735 }, { "epoch": 1.6118787466519282, "grad_norm": 4.178882102249056, "learning_rate": 3.891489711754085e-06, "loss": 0.4253, "step": 4740 }, { "epoch": 1.6135793546192763, "grad_norm": 3.37242894448145, "learning_rate": 3.8892356446972115e-06, "loss": 0.43, "step": 4745 }, { "epoch": 1.6152799625866248, "grad_norm": 26.59330743195478, "learning_rate": 3.8869799425622695e-06, "loss": 0.3683, "step": 4750 }, { "epoch": 1.616980570553973, "grad_norm": 3.2112027869457656, "learning_rate": 3.884722608004137e-06, "loss": 0.4278, "step": 4755 }, { "epoch": 1.6186811785213213, "grad_norm": 3.344270702003903, "learning_rate": 3.882463643679612e-06, "loss": 0.4212, "step": 4760 }, { "epoch": 1.6203817864886698, "grad_norm": 2.75101259170878, "learning_rate": 3.880203052247409e-06, "loss": 0.4329, "step": 4765 }, { "epoch": 1.6220823944560179, "grad_norm": 3.9130782115066487, "learning_rate": 3.8779408363681596e-06, "loss": 0.4172, "step": 4770 }, { "epoch": 1.6237830024233664, "grad_norm": 4.017189388326885, "learning_rate": 3.875676998704408e-06, "loss": 0.4262, "step": 4775 }, { "epoch": 1.6254836103907147, "grad_norm": 3.8048248710539827, "learning_rate": 3.873411541920604e-06, "loss": 0.4205, "step": 4780 }, { "epoch": 1.627184218358063, "grad_norm": 3.3326058573835655, "learning_rate": 3.871144468683106e-06, "loss": 0.4345, "step": 4785 }, { "epoch": 1.6288848263254114, "grad_norm": 8.492427714369212, "learning_rate": 3.8688757816601746e-06, "loss": 0.4436, "step": 4790 }, { "epoch": 1.6305854342927597, "grad_norm": 3.1452242135910007, "learning_rate": 3.866605483521968e-06, "loss": 0.4281, "step": 4795 }, { "epoch": 1.632286042260108, "grad_norm": 5.627570929095714, "learning_rate": 3.864333576940542e-06, "loss": 0.4254, "step": 4800 }, { "epoch": 1.6339866502274563, "grad_norm": 2.913161920879035, "learning_rate": 3.862060064589845e-06, "loss": 0.4113, "step": 4805 }, { "epoch": 1.6356872581948045, "grad_norm": 3.7039601844045973, "learning_rate": 3.859784949145715e-06, "loss": 0.4204, "step": 4810 }, { "epoch": 1.637387866162153, "grad_norm": 3.0916900895398345, "learning_rate": 3.857508233285879e-06, "loss": 0.4287, "step": 4815 }, { "epoch": 1.6390884741295013, "grad_norm": 3.6692889623634755, "learning_rate": 3.855229919689944e-06, "loss": 0.4007, "step": 4820 }, { "epoch": 1.6407890820968496, "grad_norm": 2.8591255231281116, "learning_rate": 3.8529500110394e-06, "loss": 0.4335, "step": 4825 }, { "epoch": 1.642489690064198, "grad_norm": 4.056310941557023, "learning_rate": 3.850668510017613e-06, "loss": 0.4406, "step": 4830 }, { "epoch": 1.6441902980315461, "grad_norm": 5.395369896084443, "learning_rate": 3.848385419309826e-06, "loss": 0.3904, "step": 4835 }, { "epoch": 1.6458909059988946, "grad_norm": 4.861776683262736, "learning_rate": 3.846100741603148e-06, "loss": 0.4247, "step": 4840 }, { "epoch": 1.647591513966243, "grad_norm": 3.41023038334364, "learning_rate": 3.84381447958656e-06, "loss": 0.4076, "step": 4845 }, { "epoch": 1.6492921219335912, "grad_norm": 3.6805091047550236, "learning_rate": 3.8415266359509086e-06, "loss": 0.4053, "step": 4850 }, { "epoch": 1.6509927299009397, "grad_norm": 20.0982243813851, "learning_rate": 3.8392372133888955e-06, "loss": 0.4016, "step": 4855 }, { "epoch": 1.6526933378682878, "grad_norm": 3.7021790123051233, "learning_rate": 3.836946214595087e-06, "loss": 0.4285, "step": 4860 }, { "epoch": 1.6543939458356363, "grad_norm": 44.60756019965565, "learning_rate": 3.834653642265902e-06, "loss": 0.4207, "step": 4865 }, { "epoch": 1.6560945538029845, "grad_norm": 4.035211050978466, "learning_rate": 3.832359499099613e-06, "loss": 0.453, "step": 4870 }, { "epoch": 1.6577951617703328, "grad_norm": 6.698228887542318, "learning_rate": 3.830063787796339e-06, "loss": 0.4306, "step": 4875 }, { "epoch": 1.6594957697376813, "grad_norm": 4.749773815351558, "learning_rate": 3.827766511058046e-06, "loss": 0.4571, "step": 4880 }, { "epoch": 1.6611963777050296, "grad_norm": 22.04169770775326, "learning_rate": 3.8254676715885416e-06, "loss": 0.4191, "step": 4885 }, { "epoch": 1.6628969856723779, "grad_norm": 4.802174235750299, "learning_rate": 3.823167272093475e-06, "loss": 0.419, "step": 4890 }, { "epoch": 1.6645975936397261, "grad_norm": 12.428634508926901, "learning_rate": 3.820865315280329e-06, "loss": 0.4048, "step": 4895 }, { "epoch": 1.6662982016070744, "grad_norm": 4.007313720748017, "learning_rate": 3.81856180385842e-06, "loss": 0.4132, "step": 4900 }, { "epoch": 1.667998809574423, "grad_norm": 4.2309464190047805, "learning_rate": 3.816256740538894e-06, "loss": 0.3967, "step": 4905 }, { "epoch": 1.6696994175417712, "grad_norm": 11.11545501855301, "learning_rate": 3.8139501280347243e-06, "loss": 0.3825, "step": 4910 }, { "epoch": 1.6714000255091195, "grad_norm": 3.4032753612741136, "learning_rate": 3.8116419690607066e-06, "loss": 0.422, "step": 4915 }, { "epoch": 1.673100633476468, "grad_norm": 8.32912298085406, "learning_rate": 3.8093322663334574e-06, "loss": 0.4079, "step": 4920 }, { "epoch": 1.674801241443816, "grad_norm": 8.136105962229143, "learning_rate": 3.8070210225714092e-06, "loss": 0.4225, "step": 4925 }, { "epoch": 1.6765018494111645, "grad_norm": 6.552850991103475, "learning_rate": 3.80470824049481e-06, "loss": 0.4251, "step": 4930 }, { "epoch": 1.6782024573785128, "grad_norm": 3.70278560798378, "learning_rate": 3.802393922825717e-06, "loss": 0.4231, "step": 4935 }, { "epoch": 1.679903065345861, "grad_norm": 13.268090115376083, "learning_rate": 3.8000780722879937e-06, "loss": 0.4054, "step": 4940 }, { "epoch": 1.6816036733132096, "grad_norm": 4.909682908882996, "learning_rate": 3.7977606916073113e-06, "loss": 0.4298, "step": 4945 }, { "epoch": 1.6833042812805576, "grad_norm": 6.5152501229026605, "learning_rate": 3.795441783511138e-06, "loss": 0.4279, "step": 4950 }, { "epoch": 1.6850048892479061, "grad_norm": 8.879458780565201, "learning_rate": 3.7931213507287417e-06, "loss": 0.4232, "step": 4955 }, { "epoch": 1.6867054972152544, "grad_norm": 4.539443656122637, "learning_rate": 3.790799395991185e-06, "loss": 0.3957, "step": 4960 }, { "epoch": 1.6884061051826027, "grad_norm": 3.314679697063856, "learning_rate": 3.78847592203132e-06, "loss": 0.4335, "step": 4965 }, { "epoch": 1.6901067131499512, "grad_norm": 3.5744798605623167, "learning_rate": 3.7861509315837898e-06, "loss": 0.4205, "step": 4970 }, { "epoch": 1.6918073211172995, "grad_norm": 4.249754872291892, "learning_rate": 3.7838244273850187e-06, "loss": 0.3945, "step": 4975 }, { "epoch": 1.6935079290846478, "grad_norm": 3.409071184852231, "learning_rate": 3.7814964121732164e-06, "loss": 0.4317, "step": 4980 }, { "epoch": 1.695208537051996, "grad_norm": 3.6628545875674976, "learning_rate": 3.7791668886883675e-06, "loss": 0.4161, "step": 4985 }, { "epoch": 1.6969091450193443, "grad_norm": 7.581007499161183, "learning_rate": 3.7768358596722356e-06, "loss": 0.4171, "step": 4990 }, { "epoch": 1.6986097529866928, "grad_norm": 3.765634764539352, "learning_rate": 3.7745033278683506e-06, "loss": 0.4222, "step": 4995 }, { "epoch": 1.700310360954041, "grad_norm": 3.8136428107692986, "learning_rate": 3.772169296022019e-06, "loss": 0.4222, "step": 5000 }, { "epoch": 1.7020109689213894, "grad_norm": 4.324620527193306, "learning_rate": 3.7698337668803054e-06, "loss": 0.4023, "step": 5005 }, { "epoch": 1.7037115768887379, "grad_norm": 2.7411830689404, "learning_rate": 3.767496743192042e-06, "loss": 0.4215, "step": 5010 }, { "epoch": 1.705412184856086, "grad_norm": 3.7118408685683497, "learning_rate": 3.7651582277078148e-06, "loss": 0.4369, "step": 5015 }, { "epoch": 1.7071127928234344, "grad_norm": 9.36456704680626, "learning_rate": 3.7628182231799703e-06, "loss": 0.4033, "step": 5020 }, { "epoch": 1.7088134007907827, "grad_norm": 4.523007688855365, "learning_rate": 3.760476732362606e-06, "loss": 0.4084, "step": 5025 }, { "epoch": 1.710514008758131, "grad_norm": 3.6411870432062265, "learning_rate": 3.7581337580115683e-06, "loss": 0.3942, "step": 5030 }, { "epoch": 1.7122146167254795, "grad_norm": 3.4921491078651363, "learning_rate": 3.755789302884449e-06, "loss": 0.4075, "step": 5035 }, { "epoch": 1.7139152246928275, "grad_norm": 2.482904014486906, "learning_rate": 3.7534433697405842e-06, "loss": 0.417, "step": 5040 }, { "epoch": 1.715615832660176, "grad_norm": 2.297692680568294, "learning_rate": 3.751095961341049e-06, "loss": 0.4106, "step": 5045 }, { "epoch": 1.7173164406275243, "grad_norm": 3.4326386505188715, "learning_rate": 3.748747080448654e-06, "loss": 0.4226, "step": 5050 }, { "epoch": 1.7190170485948726, "grad_norm": 4.785651921836002, "learning_rate": 3.746396729827944e-06, "loss": 0.4081, "step": 5055 }, { "epoch": 1.720717656562221, "grad_norm": 3.0609624749389277, "learning_rate": 3.744044912245194e-06, "loss": 0.4008, "step": 5060 }, { "epoch": 1.7224182645295694, "grad_norm": 4.091890461031847, "learning_rate": 3.741691630468404e-06, "loss": 0.4173, "step": 5065 }, { "epoch": 1.7241188724969176, "grad_norm": 4.649672756554727, "learning_rate": 3.739336887267298e-06, "loss": 0.412, "step": 5070 }, { "epoch": 1.725819480464266, "grad_norm": 5.44341678306046, "learning_rate": 3.7369806854133204e-06, "loss": 0.4137, "step": 5075 }, { "epoch": 1.7275200884316142, "grad_norm": 3.474137553393895, "learning_rate": 3.7346230276796325e-06, "loss": 0.411, "step": 5080 }, { "epoch": 1.7292206963989627, "grad_norm": 4.395765896168521, "learning_rate": 3.7322639168411077e-06, "loss": 0.4293, "step": 5085 }, { "epoch": 1.730921304366311, "grad_norm": 7.0977714130839304, "learning_rate": 3.729903355674332e-06, "loss": 0.4142, "step": 5090 }, { "epoch": 1.7326219123336593, "grad_norm": 7.852932063860036, "learning_rate": 3.7275413469575955e-06, "loss": 0.4282, "step": 5095 }, { "epoch": 1.7343225203010078, "grad_norm": 4.817210240263915, "learning_rate": 3.725177893470895e-06, "loss": 0.4256, "step": 5100 }, { "epoch": 1.7360231282683558, "grad_norm": 2.5794826397829436, "learning_rate": 3.722812997995925e-06, "loss": 0.4058, "step": 5105 }, { "epoch": 1.7377237362357043, "grad_norm": 3.660698064573587, "learning_rate": 3.7204466633160796e-06, "loss": 0.4105, "step": 5110 }, { "epoch": 1.7394243442030526, "grad_norm": 7.6636100537984335, "learning_rate": 3.7180788922164446e-06, "loss": 0.4082, "step": 5115 }, { "epoch": 1.7411249521704009, "grad_norm": 3.8422655885497377, "learning_rate": 3.7157096874837985e-06, "loss": 0.4131, "step": 5120 }, { "epoch": 1.7428255601377494, "grad_norm": 4.651148660746592, "learning_rate": 3.7133390519066048e-06, "loss": 0.4213, "step": 5125 }, { "epoch": 1.7445261681050974, "grad_norm": 3.4297472647983045, "learning_rate": 3.7109669882750145e-06, "loss": 0.4318, "step": 5130 }, { "epoch": 1.746226776072446, "grad_norm": 35.092387165068224, "learning_rate": 3.7085934993808546e-06, "loss": 0.4223, "step": 5135 }, { "epoch": 1.7479273840397942, "grad_norm": 4.45847131877625, "learning_rate": 3.706218588017635e-06, "loss": 0.3913, "step": 5140 }, { "epoch": 1.7496279920071425, "grad_norm": 4.7597757325098256, "learning_rate": 3.7038422569805342e-06, "loss": 0.4193, "step": 5145 }, { "epoch": 1.751328599974491, "grad_norm": 4.462704348907563, "learning_rate": 3.7014645090664065e-06, "loss": 0.415, "step": 5150 }, { "epoch": 1.7530292079418393, "grad_norm": 12.282193653179133, "learning_rate": 3.6990853470737704e-06, "loss": 0.4181, "step": 5155 }, { "epoch": 1.7547298159091875, "grad_norm": 11.877335638069301, "learning_rate": 3.6967047738028106e-06, "loss": 0.4111, "step": 5160 }, { "epoch": 1.7564304238765358, "grad_norm": 6.323596753835098, "learning_rate": 3.6943227920553727e-06, "loss": 0.4228, "step": 5165 }, { "epoch": 1.758131031843884, "grad_norm": 5.0439197351461225, "learning_rate": 3.6919394046349583e-06, "loss": 0.4177, "step": 5170 }, { "epoch": 1.7598316398112326, "grad_norm": 7.547777679994188, "learning_rate": 3.6895546143467254e-06, "loss": 0.4305, "step": 5175 }, { "epoch": 1.7615322477785809, "grad_norm": 3.389867040154592, "learning_rate": 3.6871684239974825e-06, "loss": 0.4147, "step": 5180 }, { "epoch": 1.7632328557459291, "grad_norm": 3.1099841932552965, "learning_rate": 3.684780836395686e-06, "loss": 0.4026, "step": 5185 }, { "epoch": 1.7649334637132776, "grad_norm": 2.961474684532061, "learning_rate": 3.6823918543514365e-06, "loss": 0.4185, "step": 5190 }, { "epoch": 1.7666340716806257, "grad_norm": 5.724440893942992, "learning_rate": 3.680001480676475e-06, "loss": 0.412, "step": 5195 }, { "epoch": 1.7683346796479742, "grad_norm": 4.205497790469296, "learning_rate": 3.677609718184183e-06, "loss": 0.4054, "step": 5200 }, { "epoch": 1.7700352876153225, "grad_norm": 2.683317009048377, "learning_rate": 3.675216569689574e-06, "loss": 0.4122, "step": 5205 }, { "epoch": 1.7717358955826708, "grad_norm": 4.535946082169053, "learning_rate": 3.672822038009294e-06, "loss": 0.4146, "step": 5210 }, { "epoch": 1.7734365035500192, "grad_norm": 3.380527665829096, "learning_rate": 3.6704261259616164e-06, "loss": 0.4213, "step": 5215 }, { "epoch": 1.7751371115173673, "grad_norm": 3.870065998174079, "learning_rate": 3.6680288363664394e-06, "loss": 0.4074, "step": 5220 }, { "epoch": 1.7768377194847158, "grad_norm": 3.1354538564619334, "learning_rate": 3.6656301720452835e-06, "loss": 0.4228, "step": 5225 }, { "epoch": 1.778538327452064, "grad_norm": 5.400817932617311, "learning_rate": 3.6632301358212853e-06, "loss": 0.4246, "step": 5230 }, { "epoch": 1.7802389354194124, "grad_norm": 3.8399689380174276, "learning_rate": 3.6608287305191973e-06, "loss": 0.4308, "step": 5235 }, { "epoch": 1.7819395433867609, "grad_norm": 2.997454835558299, "learning_rate": 3.6584259589653837e-06, "loss": 0.4137, "step": 5240 }, { "epoch": 1.7836401513541091, "grad_norm": 3.350556475913989, "learning_rate": 3.656021823987815e-06, "loss": 0.4005, "step": 5245 }, { "epoch": 1.7853407593214574, "grad_norm": 2.8203194142875363, "learning_rate": 3.6536163284160693e-06, "loss": 0.4476, "step": 5250 }, { "epoch": 1.7870413672888057, "grad_norm": 3.308402155803443, "learning_rate": 3.6512094750813233e-06, "loss": 0.3968, "step": 5255 }, { "epoch": 1.788741975256154, "grad_norm": 5.763247278776057, "learning_rate": 3.6488012668163524e-06, "loss": 0.4151, "step": 5260 }, { "epoch": 1.7904425832235025, "grad_norm": 5.29037745075565, "learning_rate": 3.646391706455528e-06, "loss": 0.4132, "step": 5265 }, { "epoch": 1.7921431911908507, "grad_norm": 4.138638916673085, "learning_rate": 3.6439807968348124e-06, "loss": 0.4132, "step": 5270 }, { "epoch": 1.793843799158199, "grad_norm": 6.493935903659294, "learning_rate": 3.641568540791754e-06, "loss": 0.4089, "step": 5275 }, { "epoch": 1.7955444071255475, "grad_norm": 4.802012353682913, "learning_rate": 3.639154941165488e-06, "loss": 0.4227, "step": 5280 }, { "epoch": 1.7972450150928956, "grad_norm": 3.4193309281326165, "learning_rate": 3.6367400007967303e-06, "loss": 0.4108, "step": 5285 }, { "epoch": 1.798945623060244, "grad_norm": 4.888742378693622, "learning_rate": 3.634323722527775e-06, "loss": 0.4117, "step": 5290 }, { "epoch": 1.8006462310275924, "grad_norm": 3.1983098757845716, "learning_rate": 3.6319061092024908e-06, "loss": 0.404, "step": 5295 }, { "epoch": 1.8023468389949406, "grad_norm": 3.497239045953602, "learning_rate": 3.629487163666317e-06, "loss": 0.3974, "step": 5300 }, { "epoch": 1.8040474469622891, "grad_norm": 3.389532929241317, "learning_rate": 3.6270668887662617e-06, "loss": 0.4099, "step": 5305 }, { "epoch": 1.8057480549296372, "grad_norm": 5.75582582741186, "learning_rate": 3.6246452873508974e-06, "loss": 0.4205, "step": 5310 }, { "epoch": 1.8074486628969857, "grad_norm": 4.053834889668838, "learning_rate": 3.6222223622703588e-06, "loss": 0.4371, "step": 5315 }, { "epoch": 1.809149270864334, "grad_norm": 6.526089938179861, "learning_rate": 3.6197981163763363e-06, "loss": 0.4115, "step": 5320 }, { "epoch": 1.8108498788316822, "grad_norm": 3.8633098332276345, "learning_rate": 3.617372552522076e-06, "loss": 0.4198, "step": 5325 }, { "epoch": 1.8125504867990307, "grad_norm": 4.490269234498097, "learning_rate": 3.614945673562376e-06, "loss": 0.41, "step": 5330 }, { "epoch": 1.814251094766379, "grad_norm": 3.729763555379225, "learning_rate": 3.6125174823535814e-06, "loss": 0.4261, "step": 5335 }, { "epoch": 1.8159517027337273, "grad_norm": 4.209308445244779, "learning_rate": 3.610087981753582e-06, "loss": 0.3966, "step": 5340 }, { "epoch": 1.8176523107010756, "grad_norm": 3.114516904920482, "learning_rate": 3.607657174621807e-06, "loss": 0.4314, "step": 5345 }, { "epoch": 1.8193529186684239, "grad_norm": 4.563806015836235, "learning_rate": 3.605225063819227e-06, "loss": 0.3873, "step": 5350 }, { "epoch": 1.8210535266357724, "grad_norm": 3.4794691991429563, "learning_rate": 3.602791652208344e-06, "loss": 0.4083, "step": 5355 }, { "epoch": 1.8227541346031206, "grad_norm": 3.7479801450742025, "learning_rate": 3.6003569426531913e-06, "loss": 0.4211, "step": 5360 }, { "epoch": 1.824454742570469, "grad_norm": 5.836597057493091, "learning_rate": 3.597920938019332e-06, "loss": 0.4115, "step": 5365 }, { "epoch": 1.8261553505378174, "grad_norm": 4.694732953885266, "learning_rate": 3.5954836411738497e-06, "loss": 0.4068, "step": 5370 }, { "epoch": 1.8278559585051655, "grad_norm": 3.8101990170781477, "learning_rate": 3.5930450549853525e-06, "loss": 0.4094, "step": 5375 }, { "epoch": 1.829556566472514, "grad_norm": 3.51236481656127, "learning_rate": 3.5906051823239646e-06, "loss": 0.4285, "step": 5380 }, { "epoch": 1.8312571744398622, "grad_norm": 3.94658556315043, "learning_rate": 3.588164026061324e-06, "loss": 0.4235, "step": 5385 }, { "epoch": 1.8329577824072105, "grad_norm": 3.3174850981591684, "learning_rate": 3.58572158907058e-06, "loss": 0.41, "step": 5390 }, { "epoch": 1.834658390374559, "grad_norm": 3.6492814087849252, "learning_rate": 3.5832778742263887e-06, "loss": 0.429, "step": 5395 }, { "epoch": 1.836358998341907, "grad_norm": 2.8277702279887063, "learning_rate": 3.58083288440491e-06, "loss": 0.405, "step": 5400 }, { "epoch": 1.8380596063092556, "grad_norm": 4.03173637344181, "learning_rate": 3.5783866224838056e-06, "loss": 0.4175, "step": 5405 }, { "epoch": 1.8397602142766039, "grad_norm": 5.057067485565598, "learning_rate": 3.575939091342233e-06, "loss": 0.4081, "step": 5410 }, { "epoch": 1.8414608222439521, "grad_norm": 5.22123889542372, "learning_rate": 3.5734902938608464e-06, "loss": 0.4166, "step": 5415 }, { "epoch": 1.8431614302113006, "grad_norm": 4.538960817215644, "learning_rate": 3.5710402329217853e-06, "loss": 0.4129, "step": 5420 }, { "epoch": 1.844862038178649, "grad_norm": 4.407674164743875, "learning_rate": 3.568588911408681e-06, "loss": 0.43, "step": 5425 }, { "epoch": 1.8465626461459972, "grad_norm": 5.401174955894541, "learning_rate": 3.5661363322066457e-06, "loss": 0.4276, "step": 5430 }, { "epoch": 1.8482632541133455, "grad_norm": 5.251158658037828, "learning_rate": 3.5636824982022733e-06, "loss": 0.4123, "step": 5435 }, { "epoch": 1.8499638620806937, "grad_norm": 3.470885033365259, "learning_rate": 3.5612274122836347e-06, "loss": 0.4131, "step": 5440 }, { "epoch": 1.8516644700480422, "grad_norm": 2.7229742474641583, "learning_rate": 3.5587710773402728e-06, "loss": 0.4181, "step": 5445 }, { "epoch": 1.8533650780153905, "grad_norm": 5.8691764648272295, "learning_rate": 3.556313496263202e-06, "loss": 0.4139, "step": 5450 }, { "epoch": 1.8550656859827388, "grad_norm": 3.825299235777345, "learning_rate": 3.5538546719449016e-06, "loss": 0.3916, "step": 5455 }, { "epoch": 1.8567662939500873, "grad_norm": 2.5630247348522905, "learning_rate": 3.551394607279317e-06, "loss": 0.4, "step": 5460 }, { "epoch": 1.8584669019174354, "grad_norm": 4.1369968485018696, "learning_rate": 3.5489333051618502e-06, "loss": 0.4556, "step": 5465 }, { "epoch": 1.8601675098847839, "grad_norm": 3.8075109811197185, "learning_rate": 3.546470768489362e-06, "loss": 0.4064, "step": 5470 }, { "epoch": 1.8618681178521321, "grad_norm": 2.6887363622082976, "learning_rate": 3.5440070001601645e-06, "loss": 0.4219, "step": 5475 }, { "epoch": 1.8635687258194804, "grad_norm": 5.633259612800654, "learning_rate": 3.5415420030740213e-06, "loss": 0.4053, "step": 5480 }, { "epoch": 1.865269333786829, "grad_norm": 8.192273971343418, "learning_rate": 3.539075780132141e-06, "loss": 0.4197, "step": 5485 }, { "epoch": 1.866969941754177, "grad_norm": 8.451498019783399, "learning_rate": 3.5366083342371736e-06, "loss": 0.3956, "step": 5490 }, { "epoch": 1.8686705497215255, "grad_norm": 3.479976888795481, "learning_rate": 3.534139668293213e-06, "loss": 0.425, "step": 5495 }, { "epoch": 1.8703711576888737, "grad_norm": 5.866557178030037, "learning_rate": 3.5316697852057837e-06, "loss": 0.4102, "step": 5500 }, { "epoch": 1.872071765656222, "grad_norm": 4.035005484628097, "learning_rate": 3.5291986878818465e-06, "loss": 0.4199, "step": 5505 }, { "epoch": 1.8737723736235705, "grad_norm": 3.5002876163902474, "learning_rate": 3.526726379229789e-06, "loss": 0.3934, "step": 5510 }, { "epoch": 1.8754729815909188, "grad_norm": 3.6304680834425684, "learning_rate": 3.5242528621594258e-06, "loss": 0.4432, "step": 5515 }, { "epoch": 1.877173589558267, "grad_norm": 3.6584152163360937, "learning_rate": 3.5217781395819933e-06, "loss": 0.3992, "step": 5520 }, { "epoch": 1.8788741975256154, "grad_norm": 4.457327747635903, "learning_rate": 3.5193022144101474e-06, "loss": 0.4117, "step": 5525 }, { "epoch": 1.8805748054929636, "grad_norm": 4.636453776912117, "learning_rate": 3.516825089557958e-06, "loss": 0.3967, "step": 5530 }, { "epoch": 1.8822754134603121, "grad_norm": 4.035134315793152, "learning_rate": 3.5143467679409086e-06, "loss": 0.4277, "step": 5535 }, { "epoch": 1.8839760214276604, "grad_norm": 7.208813835387405, "learning_rate": 3.5118672524758902e-06, "loss": 0.4186, "step": 5540 }, { "epoch": 1.8856766293950087, "grad_norm": 4.0132329719104645, "learning_rate": 3.5093865460811986e-06, "loss": 0.4216, "step": 5545 }, { "epoch": 1.8873772373623572, "grad_norm": 2.7614191467947378, "learning_rate": 3.506904651676532e-06, "loss": 0.4247, "step": 5550 }, { "epoch": 1.8890778453297052, "grad_norm": 3.494289269774637, "learning_rate": 3.5044215721829877e-06, "loss": 0.4194, "step": 5555 }, { "epoch": 1.8907784532970537, "grad_norm": 5.42233312332927, "learning_rate": 3.501937310523056e-06, "loss": 0.3973, "step": 5560 }, { "epoch": 1.892479061264402, "grad_norm": 5.442951689643182, "learning_rate": 3.4994518696206193e-06, "loss": 0.4116, "step": 5565 }, { "epoch": 1.8941796692317503, "grad_norm": 5.737489385443094, "learning_rate": 3.4969652524009484e-06, "loss": 0.4183, "step": 5570 }, { "epoch": 1.8958802771990988, "grad_norm": 4.014297820507776, "learning_rate": 3.4944774617906985e-06, "loss": 0.3938, "step": 5575 }, { "epoch": 1.8975808851664469, "grad_norm": 4.66789293356542, "learning_rate": 3.4919885007179045e-06, "loss": 0.4294, "step": 5580 }, { "epoch": 1.8992814931337954, "grad_norm": 4.700100407150136, "learning_rate": 3.48949837211198e-06, "loss": 0.426, "step": 5585 }, { "epoch": 1.9009821011011436, "grad_norm": 4.840905102605415, "learning_rate": 3.4870070789037137e-06, "loss": 0.3858, "step": 5590 }, { "epoch": 1.902682709068492, "grad_norm": 9.12091742025157, "learning_rate": 3.484514624025263e-06, "loss": 0.4147, "step": 5595 }, { "epoch": 1.9043833170358404, "grad_norm": 4.724736678683829, "learning_rate": 3.4820210104101537e-06, "loss": 0.3841, "step": 5600 }, { "epoch": 1.9060839250031887, "grad_norm": 5.18009895076808, "learning_rate": 3.4795262409932755e-06, "loss": 0.4282, "step": 5605 }, { "epoch": 1.907784532970537, "grad_norm": 5.792537607638843, "learning_rate": 3.4770303187108775e-06, "loss": 0.4034, "step": 5610 }, { "epoch": 1.9094851409378852, "grad_norm": 4.6349091095733295, "learning_rate": 3.4745332465005673e-06, "loss": 0.4028, "step": 5615 }, { "epoch": 1.9111857489052335, "grad_norm": 6.723455838078338, "learning_rate": 3.4720350273013037e-06, "loss": 0.4166, "step": 5620 }, { "epoch": 1.912886356872582, "grad_norm": 6.242344377413638, "learning_rate": 3.469535664053397e-06, "loss": 0.4031, "step": 5625 }, { "epoch": 1.9145869648399303, "grad_norm": 6.84802664068361, "learning_rate": 3.4670351596985046e-06, "loss": 0.415, "step": 5630 }, { "epoch": 1.9162875728072786, "grad_norm": 7.228260182496643, "learning_rate": 3.464533517179625e-06, "loss": 0.4315, "step": 5635 }, { "epoch": 1.917988180774627, "grad_norm": 2.96606041432462, "learning_rate": 3.4620307394410978e-06, "loss": 0.4223, "step": 5640 }, { "epoch": 1.9196887887419751, "grad_norm": 3.7918240411783857, "learning_rate": 3.459526829428598e-06, "loss": 0.403, "step": 5645 }, { "epoch": 1.9213893967093236, "grad_norm": 3.1192412744558955, "learning_rate": 3.4570217900891334e-06, "loss": 0.4153, "step": 5650 }, { "epoch": 1.923090004676672, "grad_norm": 9.22746745511448, "learning_rate": 3.4545156243710416e-06, "loss": 0.4118, "step": 5655 }, { "epoch": 1.9247906126440202, "grad_norm": 4.8796467224589195, "learning_rate": 3.4520083352239843e-06, "loss": 0.4113, "step": 5660 }, { "epoch": 1.9264912206113687, "grad_norm": 2.9985883082157896, "learning_rate": 3.449499925598947e-06, "loss": 0.403, "step": 5665 }, { "epoch": 1.9281918285787167, "grad_norm": 3.2382926012581246, "learning_rate": 3.446990398448233e-06, "loss": 0.4105, "step": 5670 }, { "epoch": 1.9298924365460652, "grad_norm": 4.399404706868584, "learning_rate": 3.4444797567254618e-06, "loss": 0.4302, "step": 5675 }, { "epoch": 1.9315930445134135, "grad_norm": 3.5161582978925843, "learning_rate": 3.4419680033855646e-06, "loss": 0.4182, "step": 5680 }, { "epoch": 1.9332936524807618, "grad_norm": 13.298000660642423, "learning_rate": 3.43945514138478e-06, "loss": 0.4229, "step": 5685 }, { "epoch": 1.9349942604481103, "grad_norm": 6.021996835427646, "learning_rate": 3.4369411736806518e-06, "loss": 0.3972, "step": 5690 }, { "epoch": 1.9366948684154586, "grad_norm": 3.5836568774514412, "learning_rate": 3.4344261032320256e-06, "loss": 0.4128, "step": 5695 }, { "epoch": 1.9383954763828068, "grad_norm": 3.4951768750855345, "learning_rate": 3.431909932999045e-06, "loss": 0.4233, "step": 5700 }, { "epoch": 1.9400960843501551, "grad_norm": 4.848115803542296, "learning_rate": 3.4293926659431476e-06, "loss": 0.4363, "step": 5705 }, { "epoch": 1.9417966923175034, "grad_norm": 3.8444396087392927, "learning_rate": 3.4268743050270615e-06, "loss": 0.4047, "step": 5710 }, { "epoch": 1.943497300284852, "grad_norm": 3.2226949571838714, "learning_rate": 3.4243548532148023e-06, "loss": 0.4106, "step": 5715 }, { "epoch": 1.9451979082522002, "grad_norm": 3.3308747796425955, "learning_rate": 3.4218343134716714e-06, "loss": 0.3999, "step": 5720 }, { "epoch": 1.9468985162195485, "grad_norm": 4.88703347692355, "learning_rate": 3.419312688764248e-06, "loss": 0.4048, "step": 5725 }, { "epoch": 1.948599124186897, "grad_norm": 3.9874201659248003, "learning_rate": 3.41678998206039e-06, "loss": 0.4385, "step": 5730 }, { "epoch": 1.950299732154245, "grad_norm": 6.1419864939233175, "learning_rate": 3.414266196329228e-06, "loss": 0.416, "step": 5735 }, { "epoch": 1.9520003401215935, "grad_norm": 4.112898935066507, "learning_rate": 3.411741334541163e-06, "loss": 0.3927, "step": 5740 }, { "epoch": 1.9537009480889418, "grad_norm": 3.953089011926561, "learning_rate": 3.409215399667863e-06, "loss": 0.4042, "step": 5745 }, { "epoch": 1.95540155605629, "grad_norm": 5.767046903179801, "learning_rate": 3.4066883946822566e-06, "loss": 0.4006, "step": 5750 }, { "epoch": 1.9571021640236386, "grad_norm": 3.341317702928936, "learning_rate": 3.404160322558535e-06, "loss": 0.4227, "step": 5755 }, { "epoch": 1.9588027719909866, "grad_norm": 4.07190080441077, "learning_rate": 3.401631186272143e-06, "loss": 0.416, "step": 5760 }, { "epoch": 1.9605033799583351, "grad_norm": 12.21280293374208, "learning_rate": 3.39910098879978e-06, "loss": 0.3751, "step": 5765 }, { "epoch": 1.9622039879256834, "grad_norm": 4.172650121953233, "learning_rate": 3.396569733119392e-06, "loss": 0.4123, "step": 5770 }, { "epoch": 1.9639045958930317, "grad_norm": 3.6288758910428904, "learning_rate": 3.3940374222101718e-06, "loss": 0.3984, "step": 5775 }, { "epoch": 1.9656052038603802, "grad_norm": 3.3156182811454498, "learning_rate": 3.391504059052555e-06, "loss": 0.4042, "step": 5780 }, { "epoch": 1.9673058118277285, "grad_norm": 4.9764018707293, "learning_rate": 3.3889696466282133e-06, "loss": 0.4323, "step": 5785 }, { "epoch": 1.9690064197950767, "grad_norm": 3.0156131427562265, "learning_rate": 3.3864341879200564e-06, "loss": 0.4326, "step": 5790 }, { "epoch": 1.970707027762425, "grad_norm": 2.8678588097789293, "learning_rate": 3.3838976859122217e-06, "loss": 0.3995, "step": 5795 }, { "epoch": 1.9724076357297733, "grad_norm": 3.3362617100666356, "learning_rate": 3.381360143590078e-06, "loss": 0.3796, "step": 5800 }, { "epoch": 1.9741082436971218, "grad_norm": 4.58668537659758, "learning_rate": 3.3788215639402173e-06, "loss": 0.4115, "step": 5805 }, { "epoch": 1.97580885166447, "grad_norm": 4.345595276003884, "learning_rate": 3.3762819499504517e-06, "loss": 0.4085, "step": 5810 }, { "epoch": 1.9775094596318183, "grad_norm": 5.3576993167657605, "learning_rate": 3.3737413046098115e-06, "loss": 0.4264, "step": 5815 }, { "epoch": 1.9792100675991668, "grad_norm": 5.050489795627901, "learning_rate": 3.371199630908541e-06, "loss": 0.4042, "step": 5820 }, { "epoch": 1.980910675566515, "grad_norm": 5.566547262030448, "learning_rate": 3.3686569318380935e-06, "loss": 0.4063, "step": 5825 }, { "epoch": 1.9826112835338634, "grad_norm": 3.2549721138244974, "learning_rate": 3.366113210391131e-06, "loss": 0.4224, "step": 5830 }, { "epoch": 1.9843118915012117, "grad_norm": 3.515436243577237, "learning_rate": 3.3635684695615178e-06, "loss": 0.4238, "step": 5835 }, { "epoch": 1.98601249946856, "grad_norm": 7.536854238457131, "learning_rate": 3.3610227123443175e-06, "loss": 0.3986, "step": 5840 }, { "epoch": 1.9877131074359085, "grad_norm": 3.8966337878486166, "learning_rate": 3.358475941735791e-06, "loss": 0.3911, "step": 5845 }, { "epoch": 1.9894137154032565, "grad_norm": 3.690649105892383, "learning_rate": 3.355928160733391e-06, "loss": 0.4384, "step": 5850 }, { "epoch": 1.991114323370605, "grad_norm": 4.7328145823776095, "learning_rate": 3.3533793723357606e-06, "loss": 0.3964, "step": 5855 }, { "epoch": 1.9928149313379533, "grad_norm": 3.7226029893933834, "learning_rate": 3.3508295795427275e-06, "loss": 0.4241, "step": 5860 }, { "epoch": 1.9945155393053016, "grad_norm": 2.9375452734873773, "learning_rate": 3.3482787853553013e-06, "loss": 0.4104, "step": 5865 }, { "epoch": 1.99621614727265, "grad_norm": 2.990235322214759, "learning_rate": 3.3457269927756714e-06, "loss": 0.3995, "step": 5870 }, { "epoch": 1.9979167552399983, "grad_norm": 3.893323342077673, "learning_rate": 3.3431742048072013e-06, "loss": 0.4212, "step": 5875 }, { "epoch": 1.9996173632073466, "grad_norm": 3.4787084903115204, "learning_rate": 3.340620424454427e-06, "loss": 0.4086, "step": 5880 }, { "epoch": 2.001020364780409, "grad_norm": 3.2811677378027113, "learning_rate": 3.338065654723051e-06, "loss": 0.3351, "step": 5885 }, { "epoch": 2.0027209727477575, "grad_norm": 6.913529304657979, "learning_rate": 3.335509898619942e-06, "loss": 0.385, "step": 5890 }, { "epoch": 2.0044215807151056, "grad_norm": 3.8715857028540532, "learning_rate": 3.3329531591531276e-06, "loss": 0.4009, "step": 5895 }, { "epoch": 2.006122188682454, "grad_norm": 3.564027863152233, "learning_rate": 3.330395439331795e-06, "loss": 0.3685, "step": 5900 }, { "epoch": 2.007822796649802, "grad_norm": 6.615848796909299, "learning_rate": 3.327836742166284e-06, "loss": 0.4048, "step": 5905 }, { "epoch": 2.0095234046171506, "grad_norm": 2.8990099800711495, "learning_rate": 3.325277070668084e-06, "loss": 0.3822, "step": 5910 }, { "epoch": 2.011224012584499, "grad_norm": 10.672180921903456, "learning_rate": 3.3227164278498323e-06, "loss": 0.3806, "step": 5915 }, { "epoch": 2.012924620551847, "grad_norm": 6.1773874270484965, "learning_rate": 3.32015481672531e-06, "loss": 0.4091, "step": 5920 }, { "epoch": 2.0146252285191957, "grad_norm": 6.321820591462762, "learning_rate": 3.3175922403094356e-06, "loss": 0.3809, "step": 5925 }, { "epoch": 2.0163258364865437, "grad_norm": 3.0039511137847024, "learning_rate": 3.315028701618267e-06, "loss": 0.4017, "step": 5930 }, { "epoch": 2.0180264444538922, "grad_norm": 4.7799065896337245, "learning_rate": 3.312464203668991e-06, "loss": 0.3925, "step": 5935 }, { "epoch": 2.0197270524212407, "grad_norm": 3.6456258180111085, "learning_rate": 3.309898749479926e-06, "loss": 0.3817, "step": 5940 }, { "epoch": 2.021427660388589, "grad_norm": 3.6402333636980315, "learning_rate": 3.307332342070515e-06, "loss": 0.4025, "step": 5945 }, { "epoch": 2.0231282683559373, "grad_norm": 3.4980694486792325, "learning_rate": 3.3047649844613227e-06, "loss": 0.4214, "step": 5950 }, { "epoch": 2.024828876323286, "grad_norm": 3.7702137274256247, "learning_rate": 3.3021966796740322e-06, "loss": 0.4031, "step": 5955 }, { "epoch": 2.026529484290634, "grad_norm": 4.4118924243661, "learning_rate": 3.2996274307314425e-06, "loss": 0.3826, "step": 5960 }, { "epoch": 2.0282300922579823, "grad_norm": 3.4502156659163647, "learning_rate": 3.297057240657462e-06, "loss": 0.381, "step": 5965 }, { "epoch": 2.0299307002253304, "grad_norm": 2.7373229326317103, "learning_rate": 3.294486112477108e-06, "loss": 0.3835, "step": 5970 }, { "epoch": 2.031631308192679, "grad_norm": 6.504240505072941, "learning_rate": 3.291914049216501e-06, "loss": 0.4086, "step": 5975 }, { "epoch": 2.0333319161600274, "grad_norm": 3.76788660053163, "learning_rate": 3.289341053902863e-06, "loss": 0.4276, "step": 5980 }, { "epoch": 2.0350325241273755, "grad_norm": 4.846638448314908, "learning_rate": 3.2867671295645133e-06, "loss": 0.3647, "step": 5985 }, { "epoch": 2.036733132094724, "grad_norm": 3.2953006865029733, "learning_rate": 3.2841922792308634e-06, "loss": 0.3907, "step": 5990 }, { "epoch": 2.038433740062072, "grad_norm": 5.026985716820102, "learning_rate": 3.2816165059324152e-06, "loss": 0.3981, "step": 5995 }, { "epoch": 2.0401343480294205, "grad_norm": 3.30096376406445, "learning_rate": 3.2790398127007574e-06, "loss": 0.4084, "step": 6000 }, { "epoch": 2.041834955996769, "grad_norm": 3.4806017478356446, "learning_rate": 3.27646220256856e-06, "loss": 0.4015, "step": 6005 }, { "epoch": 2.043535563964117, "grad_norm": 3.1259456878617415, "learning_rate": 3.273883678569574e-06, "loss": 0.3957, "step": 6010 }, { "epoch": 2.0452361719314656, "grad_norm": 3.761567331812402, "learning_rate": 3.271304243738625e-06, "loss": 0.3947, "step": 6015 }, { "epoch": 2.0469367798988136, "grad_norm": 3.8267836804540933, "learning_rate": 3.2687239011116105e-06, "loss": 0.3958, "step": 6020 }, { "epoch": 2.048637387866162, "grad_norm": 3.842535621197304, "learning_rate": 3.266142653725497e-06, "loss": 0.4056, "step": 6025 }, { "epoch": 2.0503379958335106, "grad_norm": 2.8454844343863566, "learning_rate": 3.263560504618315e-06, "loss": 0.3951, "step": 6030 }, { "epoch": 2.0520386038008587, "grad_norm": 6.166504416915158, "learning_rate": 3.2609774568291565e-06, "loss": 0.3822, "step": 6035 }, { "epoch": 2.053739211768207, "grad_norm": 4.756891951256056, "learning_rate": 3.2583935133981725e-06, "loss": 0.3898, "step": 6040 }, { "epoch": 2.0554398197355557, "grad_norm": 3.6888598585629535, "learning_rate": 3.2558086773665665e-06, "loss": 0.4139, "step": 6045 }, { "epoch": 2.0571404277029037, "grad_norm": 3.8564068106904985, "learning_rate": 3.2532229517765932e-06, "loss": 0.4012, "step": 6050 }, { "epoch": 2.0588410356702522, "grad_norm": 6.97753452107306, "learning_rate": 3.2506363396715553e-06, "loss": 0.4005, "step": 6055 }, { "epoch": 2.0605416436376003, "grad_norm": 3.973785882545404, "learning_rate": 3.248048844095797e-06, "loss": 0.3909, "step": 6060 }, { "epoch": 2.062242251604949, "grad_norm": 4.201427492027756, "learning_rate": 3.2454604680947028e-06, "loss": 0.3915, "step": 6065 }, { "epoch": 2.0639428595722973, "grad_norm": 5.743366083731128, "learning_rate": 3.2428712147146945e-06, "loss": 0.4036, "step": 6070 }, { "epoch": 2.0656434675396453, "grad_norm": 3.6015493587753746, "learning_rate": 3.2402810870032266e-06, "loss": 0.3851, "step": 6075 }, { "epoch": 2.067344075506994, "grad_norm": 5.106010597540066, "learning_rate": 3.2376900880087803e-06, "loss": 0.4087, "step": 6080 }, { "epoch": 2.069044683474342, "grad_norm": 6.80410315583813, "learning_rate": 3.235098220780865e-06, "loss": 0.3809, "step": 6085 }, { "epoch": 2.0707452914416904, "grad_norm": 3.199373194447593, "learning_rate": 3.2325054883700106e-06, "loss": 0.3879, "step": 6090 }, { "epoch": 2.072445899409039, "grad_norm": 5.716365369394804, "learning_rate": 3.229911893827765e-06, "loss": 0.4057, "step": 6095 }, { "epoch": 2.074146507376387, "grad_norm": 22.332863912163635, "learning_rate": 3.227317440206693e-06, "loss": 0.3954, "step": 6100 }, { "epoch": 2.0758471153437354, "grad_norm": 3.950985622322699, "learning_rate": 3.224722130560367e-06, "loss": 0.3993, "step": 6105 }, { "epoch": 2.0775477233110835, "grad_norm": 4.89561391136477, "learning_rate": 3.2221259679433693e-06, "loss": 0.3784, "step": 6110 }, { "epoch": 2.079248331278432, "grad_norm": 4.647117117745478, "learning_rate": 3.219528955411286e-06, "loss": 0.3839, "step": 6115 }, { "epoch": 2.0809489392457805, "grad_norm": 3.601560868706918, "learning_rate": 3.2169310960207034e-06, "loss": 0.3793, "step": 6120 }, { "epoch": 2.0826495472131286, "grad_norm": 7.657508120264403, "learning_rate": 3.214332392829203e-06, "loss": 0.4051, "step": 6125 }, { "epoch": 2.084350155180477, "grad_norm": 3.133410726385583, "learning_rate": 3.211732848895362e-06, "loss": 0.3815, "step": 6130 }, { "epoch": 2.0860507631478256, "grad_norm": 4.2111262336931725, "learning_rate": 3.209132467278745e-06, "loss": 0.382, "step": 6135 }, { "epoch": 2.0877513711151736, "grad_norm": 3.693712293456679, "learning_rate": 3.206531251039904e-06, "loss": 0.3916, "step": 6140 }, { "epoch": 2.089451979082522, "grad_norm": 5.942814279257445, "learning_rate": 3.203929203240371e-06, "loss": 0.3899, "step": 6145 }, { "epoch": 2.09115258704987, "grad_norm": 15.8907229176879, "learning_rate": 3.201326326942661e-06, "loss": 0.3798, "step": 6150 }, { "epoch": 2.0928531950172187, "grad_norm": 3.0203623611068093, "learning_rate": 3.1987226252102588e-06, "loss": 0.3756, "step": 6155 }, { "epoch": 2.094553802984567, "grad_norm": 8.523617546410772, "learning_rate": 3.196118101107624e-06, "loss": 0.3834, "step": 6160 }, { "epoch": 2.0962544109519152, "grad_norm": 6.317721577230588, "learning_rate": 3.1935127577001845e-06, "loss": 0.4033, "step": 6165 }, { "epoch": 2.0979550189192637, "grad_norm": 2.8884097928072845, "learning_rate": 3.19090659805433e-06, "loss": 0.393, "step": 6170 }, { "epoch": 2.099655626886612, "grad_norm": 28.933403887416787, "learning_rate": 3.1882996252374143e-06, "loss": 0.399, "step": 6175 }, { "epoch": 2.1013562348539603, "grad_norm": 7.400556619958389, "learning_rate": 3.1856918423177446e-06, "loss": 0.3847, "step": 6180 }, { "epoch": 2.103056842821309, "grad_norm": 5.381824996885983, "learning_rate": 3.1830832523645836e-06, "loss": 0.3923, "step": 6185 }, { "epoch": 2.104757450788657, "grad_norm": 4.148318719654079, "learning_rate": 3.1804738584481437e-06, "loss": 0.3928, "step": 6190 }, { "epoch": 2.1064580587560053, "grad_norm": 7.3137959695835235, "learning_rate": 3.1778636636395833e-06, "loss": 0.4074, "step": 6195 }, { "epoch": 2.1081586667233534, "grad_norm": 5.23251922134586, "learning_rate": 3.1752526710110032e-06, "loss": 0.3702, "step": 6200 }, { "epoch": 2.109859274690702, "grad_norm": 6.960677215599259, "learning_rate": 3.1726408836354438e-06, "loss": 0.3798, "step": 6205 }, { "epoch": 2.1115598826580504, "grad_norm": 7.278541102535109, "learning_rate": 3.1700283045868807e-06, "loss": 0.3932, "step": 6210 }, { "epoch": 2.1132604906253984, "grad_norm": 5.0662289021962765, "learning_rate": 3.167414936940221e-06, "loss": 0.3938, "step": 6215 }, { "epoch": 2.114961098592747, "grad_norm": 4.225044206581991, "learning_rate": 3.164800783771299e-06, "loss": 0.3844, "step": 6220 }, { "epoch": 2.1166617065600954, "grad_norm": 3.622337780704819, "learning_rate": 3.1621858481568755e-06, "loss": 0.3807, "step": 6225 }, { "epoch": 2.1183623145274435, "grad_norm": 4.3833292474446495, "learning_rate": 3.1595701331746313e-06, "loss": 0.3765, "step": 6230 }, { "epoch": 2.120062922494792, "grad_norm": 3.363567775993743, "learning_rate": 3.156953641903165e-06, "loss": 0.4002, "step": 6235 }, { "epoch": 2.12176353046214, "grad_norm": 4.60752552686727, "learning_rate": 3.1543363774219877e-06, "loss": 0.3636, "step": 6240 }, { "epoch": 2.1234641384294886, "grad_norm": 28.46386141345009, "learning_rate": 3.151718342811521e-06, "loss": 0.405, "step": 6245 }, { "epoch": 2.125164746396837, "grad_norm": 6.232217445947908, "learning_rate": 3.1490995411530936e-06, "loss": 0.3801, "step": 6250 }, { "epoch": 2.126865354364185, "grad_norm": 3.5755057375891424, "learning_rate": 3.1464799755289367e-06, "loss": 0.4139, "step": 6255 }, { "epoch": 2.1285659623315336, "grad_norm": 3.6657643773760253, "learning_rate": 3.1438596490221797e-06, "loss": 0.3807, "step": 6260 }, { "epoch": 2.1302665702988817, "grad_norm": 4.206142634849997, "learning_rate": 3.141238564716848e-06, "loss": 0.3787, "step": 6265 }, { "epoch": 2.13196717826623, "grad_norm": 9.747712005275096, "learning_rate": 3.1386167256978606e-06, "loss": 0.3961, "step": 6270 }, { "epoch": 2.1336677862335787, "grad_norm": 10.02961085346867, "learning_rate": 3.135994135051022e-06, "loss": 0.3995, "step": 6275 }, { "epoch": 2.1353683942009267, "grad_norm": 2.920442716069924, "learning_rate": 3.1333707958630232e-06, "loss": 0.3877, "step": 6280 }, { "epoch": 2.1370690021682752, "grad_norm": 4.088653138808995, "learning_rate": 3.130746711221436e-06, "loss": 0.3912, "step": 6285 }, { "epoch": 2.1387696101356233, "grad_norm": 2.919705730672031, "learning_rate": 3.128121884214709e-06, "loss": 0.3757, "step": 6290 }, { "epoch": 2.1404702181029718, "grad_norm": 5.139100889696049, "learning_rate": 3.1254963179321645e-06, "loss": 0.4051, "step": 6295 }, { "epoch": 2.1421708260703203, "grad_norm": 8.86992930823528, "learning_rate": 3.1228700154639957e-06, "loss": 0.4186, "step": 6300 }, { "epoch": 2.1438714340376683, "grad_norm": 4.007375688761407, "learning_rate": 3.1202429799012612e-06, "loss": 0.3816, "step": 6305 }, { "epoch": 2.145572042005017, "grad_norm": 3.187399320395256, "learning_rate": 3.117615214335884e-06, "loss": 0.3822, "step": 6310 }, { "epoch": 2.147272649972365, "grad_norm": 3.756238102309157, "learning_rate": 3.1149867218606437e-06, "loss": 0.3872, "step": 6315 }, { "epoch": 2.1489732579397134, "grad_norm": 3.6426880756085707, "learning_rate": 3.1123575055691786e-06, "loss": 0.3747, "step": 6320 }, { "epoch": 2.150673865907062, "grad_norm": 2.9304753447302168, "learning_rate": 3.1097275685559764e-06, "loss": 0.403, "step": 6325 }, { "epoch": 2.15237447387441, "grad_norm": 2.8120536872009683, "learning_rate": 3.1070969139163744e-06, "loss": 0.3761, "step": 6330 }, { "epoch": 2.1540750818417584, "grad_norm": 3.217746393107104, "learning_rate": 3.1044655447465537e-06, "loss": 0.3917, "step": 6335 }, { "epoch": 2.155775689809107, "grad_norm": 8.022088518257592, "learning_rate": 3.1018334641435365e-06, "loss": 0.3975, "step": 6340 }, { "epoch": 2.157476297776455, "grad_norm": 9.534585463758319, "learning_rate": 3.099200675205184e-06, "loss": 0.3926, "step": 6345 }, { "epoch": 2.1591769057438035, "grad_norm": 3.0045397628551926, "learning_rate": 3.096567181030188e-06, "loss": 0.3936, "step": 6350 }, { "epoch": 2.1608775137111516, "grad_norm": 3.9190507509718318, "learning_rate": 3.0939329847180725e-06, "loss": 0.3816, "step": 6355 }, { "epoch": 2.1625781216785, "grad_norm": 4.673292029453938, "learning_rate": 3.0912980893691883e-06, "loss": 0.3907, "step": 6360 }, { "epoch": 2.1642787296458486, "grad_norm": 3.315784859499087, "learning_rate": 3.088662498084708e-06, "loss": 0.3989, "step": 6365 }, { "epoch": 2.1659793376131966, "grad_norm": 6.957493367825952, "learning_rate": 3.086026213966622e-06, "loss": 0.3893, "step": 6370 }, { "epoch": 2.167679945580545, "grad_norm": 4.573861308675167, "learning_rate": 3.083389240117739e-06, "loss": 0.3838, "step": 6375 }, { "epoch": 2.169380553547893, "grad_norm": 5.145518565607208, "learning_rate": 3.0807515796416766e-06, "loss": 0.3884, "step": 6380 }, { "epoch": 2.1710811615152417, "grad_norm": 3.8807568929567764, "learning_rate": 3.0781132356428633e-06, "loss": 0.3845, "step": 6385 }, { "epoch": 2.17278176948259, "grad_norm": 5.3203271906906915, "learning_rate": 3.0754742112265294e-06, "loss": 0.4042, "step": 6390 }, { "epoch": 2.174482377449938, "grad_norm": 3.3306654186981595, "learning_rate": 3.0728345094987078e-06, "loss": 0.3845, "step": 6395 }, { "epoch": 2.1761829854172867, "grad_norm": 3.1182587811158404, "learning_rate": 3.070194133566229e-06, "loss": 0.3963, "step": 6400 }, { "epoch": 2.177883593384635, "grad_norm": 2.8494439811287555, "learning_rate": 3.0675530865367143e-06, "loss": 0.3861, "step": 6405 }, { "epoch": 2.1795842013519833, "grad_norm": 25.04637976539768, "learning_rate": 3.0649113715185772e-06, "loss": 0.3752, "step": 6410 }, { "epoch": 2.1812848093193318, "grad_norm": 6.638253290510534, "learning_rate": 3.0622689916210185e-06, "loss": 0.4129, "step": 6415 }, { "epoch": 2.18298541728668, "grad_norm": 4.096836227802035, "learning_rate": 3.0596259499540178e-06, "loss": 0.4029, "step": 6420 }, { "epoch": 2.1846860252540283, "grad_norm": 5.949966887843709, "learning_rate": 3.056982249628337e-06, "loss": 0.3671, "step": 6425 }, { "epoch": 2.186386633221377, "grad_norm": 5.186398482392616, "learning_rate": 3.0543378937555113e-06, "loss": 0.3921, "step": 6430 }, { "epoch": 2.188087241188725, "grad_norm": 4.8792204209101815, "learning_rate": 3.0516928854478478e-06, "loss": 0.4044, "step": 6435 }, { "epoch": 2.1897878491560734, "grad_norm": 3.003562599707504, "learning_rate": 3.0490472278184226e-06, "loss": 0.3849, "step": 6440 }, { "epoch": 2.1914884571234214, "grad_norm": 4.526727944224896, "learning_rate": 3.0464009239810745e-06, "loss": 0.3804, "step": 6445 }, { "epoch": 2.19318906509077, "grad_norm": 28.20529948007441, "learning_rate": 3.0437539770504038e-06, "loss": 0.4042, "step": 6450 }, { "epoch": 2.1948896730581184, "grad_norm": 4.058541629984453, "learning_rate": 3.041106390141767e-06, "loss": 0.3709, "step": 6455 }, { "epoch": 2.1965902810254665, "grad_norm": 3.1362985280636346, "learning_rate": 3.0384581663712747e-06, "loss": 0.3841, "step": 6460 }, { "epoch": 2.198290888992815, "grad_norm": 4.053826773580988, "learning_rate": 3.0358093088557867e-06, "loss": 0.3952, "step": 6465 }, { "epoch": 2.199991496960163, "grad_norm": 6.578596836430825, "learning_rate": 3.0331598207129078e-06, "loss": 0.3776, "step": 6470 }, { "epoch": 2.2016921049275116, "grad_norm": 3.073007554583588, "learning_rate": 3.0305097050609868e-06, "loss": 0.3831, "step": 6475 }, { "epoch": 2.20339271289486, "grad_norm": 10.660662387188902, "learning_rate": 3.0278589650191086e-06, "loss": 0.4201, "step": 6480 }, { "epoch": 2.205093320862208, "grad_norm": 3.634437435981205, "learning_rate": 3.025207603707096e-06, "loss": 0.4143, "step": 6485 }, { "epoch": 2.2067939288295566, "grad_norm": 2.3541378661435775, "learning_rate": 3.022555624245501e-06, "loss": 0.3784, "step": 6490 }, { "epoch": 2.2084945367969047, "grad_norm": 3.713354619796085, "learning_rate": 3.019903029755604e-06, "loss": 0.3941, "step": 6495 }, { "epoch": 2.210195144764253, "grad_norm": 3.4344135334816386, "learning_rate": 3.0172498233594085e-06, "loss": 0.3905, "step": 6500 }, { "epoch": 2.2118957527316017, "grad_norm": 5.643221608685602, "learning_rate": 3.014596008179638e-06, "loss": 0.3904, "step": 6505 }, { "epoch": 2.2135963606989497, "grad_norm": 3.2816927802420754, "learning_rate": 3.011941587339734e-06, "loss": 0.3775, "step": 6510 }, { "epoch": 2.215296968666298, "grad_norm": 6.472534170411608, "learning_rate": 3.0092865639638496e-06, "loss": 0.3884, "step": 6515 }, { "epoch": 2.2169975766336467, "grad_norm": 5.360474824776912, "learning_rate": 3.006630941176847e-06, "loss": 0.3934, "step": 6520 }, { "epoch": 2.2186981846009948, "grad_norm": 4.213214855907222, "learning_rate": 3.0039747221042947e-06, "loss": 0.4072, "step": 6525 }, { "epoch": 2.2203987925683433, "grad_norm": 4.0313901632910305, "learning_rate": 3.0013179098724626e-06, "loss": 0.4198, "step": 6530 }, { "epoch": 2.2220994005356913, "grad_norm": 6.320108057489615, "learning_rate": 2.9986605076083185e-06, "loss": 0.3846, "step": 6535 }, { "epoch": 2.22380000850304, "grad_norm": 3.3568667388543942, "learning_rate": 2.9960025184395248e-06, "loss": 0.3887, "step": 6540 }, { "epoch": 2.2255006164703883, "grad_norm": 6.005606524940383, "learning_rate": 2.9933439454944346e-06, "loss": 0.3782, "step": 6545 }, { "epoch": 2.2272012244377364, "grad_norm": 5.17729596720581, "learning_rate": 2.990684791902089e-06, "loss": 0.3946, "step": 6550 }, { "epoch": 2.228901832405085, "grad_norm": 2.794064491309928, "learning_rate": 2.9880250607922107e-06, "loss": 0.3722, "step": 6555 }, { "epoch": 2.230602440372433, "grad_norm": 7.343356762429561, "learning_rate": 2.9853647552952037e-06, "loss": 0.3866, "step": 6560 }, { "epoch": 2.2323030483397814, "grad_norm": 3.7349707455890697, "learning_rate": 2.982703878542147e-06, "loss": 0.3865, "step": 6565 }, { "epoch": 2.23400365630713, "grad_norm": 7.457076904709625, "learning_rate": 2.9800424336647933e-06, "loss": 0.4007, "step": 6570 }, { "epoch": 2.235704264274478, "grad_norm": 3.2239996372192397, "learning_rate": 2.9773804237955616e-06, "loss": 0.3824, "step": 6575 }, { "epoch": 2.2374048722418265, "grad_norm": 2.9883472148059393, "learning_rate": 2.974717852067539e-06, "loss": 0.3784, "step": 6580 }, { "epoch": 2.239105480209175, "grad_norm": 3.6203687143679373, "learning_rate": 2.9720547216144714e-06, "loss": 0.3922, "step": 6585 }, { "epoch": 2.240806088176523, "grad_norm": 4.057120091702925, "learning_rate": 2.9693910355707622e-06, "loss": 0.4069, "step": 6590 }, { "epoch": 2.2425066961438715, "grad_norm": 2.8483307786069685, "learning_rate": 2.9667267970714714e-06, "loss": 0.4098, "step": 6595 }, { "epoch": 2.2442073041112196, "grad_norm": 3.3810780108085052, "learning_rate": 2.9640620092523064e-06, "loss": 0.4086, "step": 6600 }, { "epoch": 2.245907912078568, "grad_norm": 4.052869027985857, "learning_rate": 2.9613966752496215e-06, "loss": 0.3857, "step": 6605 }, { "epoch": 2.2476085200459166, "grad_norm": 4.073745702007441, "learning_rate": 2.958730798200416e-06, "loss": 0.3902, "step": 6610 }, { "epoch": 2.2493091280132647, "grad_norm": 9.767610746705577, "learning_rate": 2.9560643812423258e-06, "loss": 0.3944, "step": 6615 }, { "epoch": 2.251009735980613, "grad_norm": 3.8926434508545413, "learning_rate": 2.953397427513624e-06, "loss": 0.399, "step": 6620 }, { "epoch": 2.252710343947961, "grad_norm": 4.857648522720984, "learning_rate": 2.950729940153215e-06, "loss": 0.3699, "step": 6625 }, { "epoch": 2.2544109519153097, "grad_norm": 4.578974296469672, "learning_rate": 2.9480619223006297e-06, "loss": 0.4074, "step": 6630 }, { "epoch": 2.256111559882658, "grad_norm": 2.9057422570205422, "learning_rate": 2.9453933770960254e-06, "loss": 0.4073, "step": 6635 }, { "epoch": 2.2578121678500063, "grad_norm": 8.47568626328369, "learning_rate": 2.9427243076801797e-06, "loss": 0.3659, "step": 6640 }, { "epoch": 2.2595127758173548, "grad_norm": 3.2068623693933263, "learning_rate": 2.9400547171944864e-06, "loss": 0.3765, "step": 6645 }, { "epoch": 2.2612133837847033, "grad_norm": 3.742057678575811, "learning_rate": 2.937384608780953e-06, "loss": 0.3909, "step": 6650 }, { "epoch": 2.2629139917520513, "grad_norm": 2.860519356877674, "learning_rate": 2.9347139855821978e-06, "loss": 0.3816, "step": 6655 }, { "epoch": 2.2646145997194, "grad_norm": 2.580593782356621, "learning_rate": 2.932042850741442e-06, "loss": 0.3787, "step": 6660 }, { "epoch": 2.266315207686748, "grad_norm": 4.783146635286192, "learning_rate": 2.929371207402511e-06, "loss": 0.3949, "step": 6665 }, { "epoch": 2.2680158156540964, "grad_norm": 13.918648907622003, "learning_rate": 2.9266990587098297e-06, "loss": 0.384, "step": 6670 }, { "epoch": 2.2697164236214444, "grad_norm": 3.798237473233975, "learning_rate": 2.9240264078084163e-06, "loss": 0.3929, "step": 6675 }, { "epoch": 2.271417031588793, "grad_norm": 3.673668146030382, "learning_rate": 2.9213532578438797e-06, "loss": 0.3897, "step": 6680 }, { "epoch": 2.2731176395561414, "grad_norm": 4.35686860967354, "learning_rate": 2.9186796119624166e-06, "loss": 0.3788, "step": 6685 }, { "epoch": 2.2748182475234895, "grad_norm": 3.013009008419703, "learning_rate": 2.9160054733108085e-06, "loss": 0.375, "step": 6690 }, { "epoch": 2.276518855490838, "grad_norm": 4.732871979512938, "learning_rate": 2.913330845036415e-06, "loss": 0.3877, "step": 6695 }, { "epoch": 2.2782194634581865, "grad_norm": 2.8601369197988844, "learning_rate": 2.9106557302871735e-06, "loss": 0.3728, "step": 6700 }, { "epoch": 2.2799200714255345, "grad_norm": 3.4292070952688776, "learning_rate": 2.9079801322115938e-06, "loss": 0.4044, "step": 6705 }, { "epoch": 2.281620679392883, "grad_norm": 7.024904512911684, "learning_rate": 2.905304053958753e-06, "loss": 0.3762, "step": 6710 }, { "epoch": 2.283321287360231, "grad_norm": 3.25399253417125, "learning_rate": 2.902627498678295e-06, "loss": 0.3875, "step": 6715 }, { "epoch": 2.2850218953275796, "grad_norm": 30.94644471801256, "learning_rate": 2.8999504695204246e-06, "loss": 0.3727, "step": 6720 }, { "epoch": 2.286722503294928, "grad_norm": 5.520919390543611, "learning_rate": 2.8972729696359035e-06, "loss": 0.3785, "step": 6725 }, { "epoch": 2.288423111262276, "grad_norm": 4.206500382437017, "learning_rate": 2.8945950021760504e-06, "loss": 0.4015, "step": 6730 }, { "epoch": 2.2901237192296247, "grad_norm": 3.5001389066297905, "learning_rate": 2.891916570292731e-06, "loss": 0.3736, "step": 6735 }, { "epoch": 2.2918243271969727, "grad_norm": 4.046070751027091, "learning_rate": 2.8892376771383586e-06, "loss": 0.3694, "step": 6740 }, { "epoch": 2.293524935164321, "grad_norm": 6.65064340381129, "learning_rate": 2.8865583258658895e-06, "loss": 0.3982, "step": 6745 }, { "epoch": 2.2952255431316697, "grad_norm": 2.9008563706204176, "learning_rate": 2.8838785196288205e-06, "loss": 0.4032, "step": 6750 }, { "epoch": 2.2969261510990178, "grad_norm": 3.406796171684056, "learning_rate": 2.881198261581182e-06, "loss": 0.3912, "step": 6755 }, { "epoch": 2.2986267590663663, "grad_norm": 5.817153214684034, "learning_rate": 2.878517554877536e-06, "loss": 0.3682, "step": 6760 }, { "epoch": 2.3003273670337148, "grad_norm": 5.904612058981914, "learning_rate": 2.8758364026729742e-06, "loss": 0.3938, "step": 6765 }, { "epoch": 2.302027975001063, "grad_norm": 4.497725265776514, "learning_rate": 2.8731548081231114e-06, "loss": 0.4018, "step": 6770 }, { "epoch": 2.3037285829684113, "grad_norm": 3.0306298954449185, "learning_rate": 2.870472774384084e-06, "loss": 0.3864, "step": 6775 }, { "epoch": 2.3054291909357594, "grad_norm": 3.957901406872624, "learning_rate": 2.867790304612545e-06, "loss": 0.3897, "step": 6780 }, { "epoch": 2.307129798903108, "grad_norm": 4.19134334326208, "learning_rate": 2.8651074019656594e-06, "loss": 0.3919, "step": 6785 }, { "epoch": 2.3088304068704564, "grad_norm": 4.062622192145356, "learning_rate": 2.862424069601103e-06, "loss": 0.4088, "step": 6790 }, { "epoch": 2.3105310148378044, "grad_norm": 7.4185656087697405, "learning_rate": 2.859740310677058e-06, "loss": 0.3816, "step": 6795 }, { "epoch": 2.312231622805153, "grad_norm": 4.226402215845138, "learning_rate": 2.8570561283522063e-06, "loss": 0.3814, "step": 6800 }, { "epoch": 2.313932230772501, "grad_norm": 5.2505222772393525, "learning_rate": 2.8543715257857294e-06, "loss": 0.3777, "step": 6805 }, { "epoch": 2.3156328387398495, "grad_norm": 6.356064017229173, "learning_rate": 2.851686506137305e-06, "loss": 0.3712, "step": 6810 }, { "epoch": 2.317333446707198, "grad_norm": 3.5856118910345103, "learning_rate": 2.849001072567099e-06, "loss": 0.3847, "step": 6815 }, { "epoch": 2.319034054674546, "grad_norm": 4.549197226747274, "learning_rate": 2.846315228235766e-06, "loss": 0.4028, "step": 6820 }, { "epoch": 2.3207346626418945, "grad_norm": 3.485550956628789, "learning_rate": 2.8436289763044434e-06, "loss": 0.3679, "step": 6825 }, { "epoch": 2.322435270609243, "grad_norm": 3.378212405100649, "learning_rate": 2.8409423199347484e-06, "loss": 0.4032, "step": 6830 }, { "epoch": 2.324135878576591, "grad_norm": 3.121843876887772, "learning_rate": 2.8382552622887753e-06, "loss": 0.3889, "step": 6835 }, { "epoch": 2.3258364865439396, "grad_norm": 7.439721804395752, "learning_rate": 2.8355678065290893e-06, "loss": 0.3964, "step": 6840 }, { "epoch": 2.3275370945112877, "grad_norm": 3.5065346203448944, "learning_rate": 2.8328799558187237e-06, "loss": 0.398, "step": 6845 }, { "epoch": 2.329237702478636, "grad_norm": 4.170041707692432, "learning_rate": 2.83019171332118e-06, "loss": 0.377, "step": 6850 }, { "epoch": 2.330938310445984, "grad_norm": 5.121636763633387, "learning_rate": 2.8275030822004165e-06, "loss": 0.3692, "step": 6855 }, { "epoch": 2.3326389184133327, "grad_norm": 5.461538166371756, "learning_rate": 2.824814065620851e-06, "loss": 0.413, "step": 6860 }, { "epoch": 2.334339526380681, "grad_norm": 3.3821381664776826, "learning_rate": 2.822124666747356e-06, "loss": 0.3692, "step": 6865 }, { "epoch": 2.3360401343480293, "grad_norm": 3.14786263283993, "learning_rate": 2.8194348887452518e-06, "loss": 0.397, "step": 6870 }, { "epoch": 2.3377407423153778, "grad_norm": 3.8455746251884926, "learning_rate": 2.8167447347803057e-06, "loss": 0.3823, "step": 6875 }, { "epoch": 2.3394413502827263, "grad_norm": 3.294819905660892, "learning_rate": 2.814054208018728e-06, "loss": 0.3888, "step": 6880 }, { "epoch": 2.3411419582500743, "grad_norm": 4.042913409326378, "learning_rate": 2.811363311627168e-06, "loss": 0.3892, "step": 6885 }, { "epoch": 2.342842566217423, "grad_norm": 5.614812550089777, "learning_rate": 2.808672048772709e-06, "loss": 0.3675, "step": 6890 }, { "epoch": 2.344543174184771, "grad_norm": 9.879426887704412, "learning_rate": 2.8059804226228655e-06, "loss": 0.4198, "step": 6895 }, { "epoch": 2.3462437821521194, "grad_norm": 2.4671869965760216, "learning_rate": 2.803288436345581e-06, "loss": 0.3663, "step": 6900 }, { "epoch": 2.347944390119468, "grad_norm": 3.8103039255669935, "learning_rate": 2.8005960931092207e-06, "loss": 0.3709, "step": 6905 }, { "epoch": 2.349644998086816, "grad_norm": 3.0719064184061162, "learning_rate": 2.7979033960825734e-06, "loss": 0.3774, "step": 6910 }, { "epoch": 2.3513456060541644, "grad_norm": 5.930297960854046, "learning_rate": 2.7952103484348407e-06, "loss": 0.37, "step": 6915 }, { "epoch": 2.3530462140215125, "grad_norm": 4.240313783774013, "learning_rate": 2.792516953335639e-06, "loss": 0.3748, "step": 6920 }, { "epoch": 2.354746821988861, "grad_norm": 5.371956477197449, "learning_rate": 2.7898232139549917e-06, "loss": 0.3681, "step": 6925 }, { "epoch": 2.3564474299562095, "grad_norm": 4.448795807879105, "learning_rate": 2.7871291334633305e-06, "loss": 0.378, "step": 6930 }, { "epoch": 2.3581480379235575, "grad_norm": 2.398314300258431, "learning_rate": 2.784434715031486e-06, "loss": 0.378, "step": 6935 }, { "epoch": 2.359848645890906, "grad_norm": 4.053282477849935, "learning_rate": 2.781739961830687e-06, "loss": 0.3919, "step": 6940 }, { "epoch": 2.3615492538582545, "grad_norm": 3.434192241461476, "learning_rate": 2.779044877032556e-06, "loss": 0.3864, "step": 6945 }, { "epoch": 2.3632498618256026, "grad_norm": 3.5127030227379796, "learning_rate": 2.7763494638091074e-06, "loss": 0.3837, "step": 6950 }, { "epoch": 2.364950469792951, "grad_norm": 12.549471937745547, "learning_rate": 2.773653725332741e-06, "loss": 0.4015, "step": 6955 }, { "epoch": 2.366651077760299, "grad_norm": 4.330594942310785, "learning_rate": 2.770957664776239e-06, "loss": 0.3928, "step": 6960 }, { "epoch": 2.3683516857276476, "grad_norm": 2.6158581072547653, "learning_rate": 2.7682612853127634e-06, "loss": 0.3895, "step": 6965 }, { "epoch": 2.370052293694996, "grad_norm": 4.641906618466447, "learning_rate": 2.7655645901158516e-06, "loss": 0.3757, "step": 6970 }, { "epoch": 2.371752901662344, "grad_norm": 10.877258540651997, "learning_rate": 2.7628675823594132e-06, "loss": 0.3811, "step": 6975 }, { "epoch": 2.3734535096296927, "grad_norm": 3.0562321854812358, "learning_rate": 2.7601702652177225e-06, "loss": 0.3778, "step": 6980 }, { "epoch": 2.3751541175970408, "grad_norm": 11.777994691598085, "learning_rate": 2.7574726418654225e-06, "loss": 0.4107, "step": 6985 }, { "epoch": 2.3768547255643893, "grad_norm": 3.2157204196506908, "learning_rate": 2.7547747154775133e-06, "loss": 0.3915, "step": 6990 }, { "epoch": 2.3785553335317378, "grad_norm": 4.373250719021987, "learning_rate": 2.752076489229353e-06, "loss": 0.3949, "step": 6995 }, { "epoch": 2.380255941499086, "grad_norm": 4.667751747557542, "learning_rate": 2.749377966296652e-06, "loss": 0.3898, "step": 7000 }, { "epoch": 2.3819565494664343, "grad_norm": 2.70105224071973, "learning_rate": 2.74667914985547e-06, "loss": 0.3866, "step": 7005 }, { "epoch": 2.383657157433783, "grad_norm": 3.0249429323232353, "learning_rate": 2.743980043082214e-06, "loss": 0.369, "step": 7010 }, { "epoch": 2.385357765401131, "grad_norm": 3.4397168493562216, "learning_rate": 2.741280649153629e-06, "loss": 0.396, "step": 7015 }, { "epoch": 2.3870583733684794, "grad_norm": 5.2931519021161515, "learning_rate": 2.738580971246801e-06, "loss": 0.407, "step": 7020 }, { "epoch": 2.3887589813358274, "grad_norm": 3.093863474694001, "learning_rate": 2.735881012539149e-06, "loss": 0.3708, "step": 7025 }, { "epoch": 2.390459589303176, "grad_norm": 2.7433960769201233, "learning_rate": 2.7331807762084236e-06, "loss": 0.3787, "step": 7030 }, { "epoch": 2.392160197270524, "grad_norm": 4.579031308987416, "learning_rate": 2.7304802654327007e-06, "loss": 0.3801, "step": 7035 }, { "epoch": 2.3938608052378725, "grad_norm": 7.896593870725533, "learning_rate": 2.727779483390379e-06, "loss": 0.3988, "step": 7040 }, { "epoch": 2.395561413205221, "grad_norm": 2.920203355513295, "learning_rate": 2.7250784332601793e-06, "loss": 0.3777, "step": 7045 }, { "epoch": 2.397262021172569, "grad_norm": 3.2350813759464345, "learning_rate": 2.722377118221135e-06, "loss": 0.3658, "step": 7050 }, { "epoch": 2.3989626291399175, "grad_norm": 6.238155764872062, "learning_rate": 2.719675541452592e-06, "loss": 0.4156, "step": 7055 }, { "epoch": 2.400663237107266, "grad_norm": 2.976892920763275, "learning_rate": 2.7169737061342044e-06, "loss": 0.3966, "step": 7060 }, { "epoch": 2.402363845074614, "grad_norm": 3.093711932280492, "learning_rate": 2.7142716154459307e-06, "loss": 0.3618, "step": 7065 }, { "epoch": 2.4040644530419626, "grad_norm": 3.4625004458174695, "learning_rate": 2.7115692725680304e-06, "loss": 0.376, "step": 7070 }, { "epoch": 2.4057650610093106, "grad_norm": 4.281537573703623, "learning_rate": 2.708866680681059e-06, "loss": 0.3883, "step": 7075 }, { "epoch": 2.407465668976659, "grad_norm": 3.716958213713566, "learning_rate": 2.7061638429658653e-06, "loss": 0.3723, "step": 7080 }, { "epoch": 2.4091662769440076, "grad_norm": 3.6006577849348447, "learning_rate": 2.703460762603588e-06, "loss": 0.3617, "step": 7085 }, { "epoch": 2.4108668849113557, "grad_norm": 24.360757548436837, "learning_rate": 2.700757442775651e-06, "loss": 0.3924, "step": 7090 }, { "epoch": 2.412567492878704, "grad_norm": 2.888222589544978, "learning_rate": 2.6980538866637594e-06, "loss": 0.3937, "step": 7095 }, { "epoch": 2.4142681008460523, "grad_norm": 2.921687785117801, "learning_rate": 2.695350097449897e-06, "loss": 0.3888, "step": 7100 }, { "epoch": 2.4159687088134008, "grad_norm": 4.1461127523046, "learning_rate": 2.6926460783163223e-06, "loss": 0.3968, "step": 7105 }, { "epoch": 2.4176693167807493, "grad_norm": 4.712695647518259, "learning_rate": 2.6899418324455643e-06, "loss": 0.389, "step": 7110 }, { "epoch": 2.4193699247480973, "grad_norm": 3.115151541680386, "learning_rate": 2.6872373630204186e-06, "loss": 0.3727, "step": 7115 }, { "epoch": 2.421070532715446, "grad_norm": 3.157897674566553, "learning_rate": 2.684532673223943e-06, "loss": 0.3753, "step": 7120 }, { "epoch": 2.4227711406827943, "grad_norm": 3.154771734393668, "learning_rate": 2.6818277662394567e-06, "loss": 0.3981, "step": 7125 }, { "epoch": 2.4244717486501424, "grad_norm": 2.9277590011213386, "learning_rate": 2.6791226452505326e-06, "loss": 0.3768, "step": 7130 }, { "epoch": 2.426172356617491, "grad_norm": 3.4408684506274283, "learning_rate": 2.676417313440997e-06, "loss": 0.3779, "step": 7135 }, { "epoch": 2.427872964584839, "grad_norm": 3.7907855368258234, "learning_rate": 2.673711773994923e-06, "loss": 0.3543, "step": 7140 }, { "epoch": 2.4295735725521874, "grad_norm": 2.7577432226511003, "learning_rate": 2.671006030096629e-06, "loss": 0.3591, "step": 7145 }, { "epoch": 2.431274180519536, "grad_norm": 3.70322966299185, "learning_rate": 2.668300084930674e-06, "loss": 0.3771, "step": 7150 }, { "epoch": 2.432974788486884, "grad_norm": 3.5439886993149985, "learning_rate": 2.6655939416818534e-06, "loss": 0.3956, "step": 7155 }, { "epoch": 2.4346753964542325, "grad_norm": 3.1874622661076577, "learning_rate": 2.6628876035351948e-06, "loss": 0.3989, "step": 7160 }, { "epoch": 2.4363760044215805, "grad_norm": 5.7706010189966985, "learning_rate": 2.660181073675958e-06, "loss": 0.3575, "step": 7165 }, { "epoch": 2.438076612388929, "grad_norm": 3.431719425794485, "learning_rate": 2.6574743552896266e-06, "loss": 0.3687, "step": 7170 }, { "epoch": 2.4397772203562775, "grad_norm": 15.30462636095367, "learning_rate": 2.6547674515619053e-06, "loss": 0.368, "step": 7175 }, { "epoch": 2.4414778283236256, "grad_norm": 3.7179034277483787, "learning_rate": 2.6520603656787187e-06, "loss": 0.3843, "step": 7180 }, { "epoch": 2.443178436290974, "grad_norm": 3.49585049531343, "learning_rate": 2.6493531008262054e-06, "loss": 0.3933, "step": 7185 }, { "epoch": 2.4448790442583226, "grad_norm": 5.402626100935758, "learning_rate": 2.6466456601907127e-06, "loss": 0.3975, "step": 7190 }, { "epoch": 2.4465796522256706, "grad_norm": 3.944811421987731, "learning_rate": 2.643938046958797e-06, "loss": 0.3778, "step": 7195 }, { "epoch": 2.448280260193019, "grad_norm": 4.352984068739917, "learning_rate": 2.6412302643172184e-06, "loss": 0.3751, "step": 7200 }, { "epoch": 2.449980868160367, "grad_norm": 5.990844269985125, "learning_rate": 2.638522315452934e-06, "loss": 0.384, "step": 7205 }, { "epoch": 2.4516814761277157, "grad_norm": 4.424332453063507, "learning_rate": 2.635814203553097e-06, "loss": 0.3828, "step": 7210 }, { "epoch": 2.4533820840950638, "grad_norm": 4.176743372483272, "learning_rate": 2.6331059318050543e-06, "loss": 0.3881, "step": 7215 }, { "epoch": 2.4550826920624123, "grad_norm": 3.310052928713474, "learning_rate": 2.6303975033963396e-06, "loss": 0.3757, "step": 7220 }, { "epoch": 2.4567833000297608, "grad_norm": 8.163648025882706, "learning_rate": 2.627688921514672e-06, "loss": 0.4053, "step": 7225 }, { "epoch": 2.458483907997109, "grad_norm": 5.165457546817537, "learning_rate": 2.6249801893479483e-06, "loss": 0.3735, "step": 7230 }, { "epoch": 2.4601845159644573, "grad_norm": 6.269256791836688, "learning_rate": 2.622271310084246e-06, "loss": 0.3573, "step": 7235 }, { "epoch": 2.461885123931806, "grad_norm": 9.252976144429491, "learning_rate": 2.619562286911814e-06, "loss": 0.3759, "step": 7240 }, { "epoch": 2.463585731899154, "grad_norm": 3.833607786003582, "learning_rate": 2.6168531230190703e-06, "loss": 0.3699, "step": 7245 }, { "epoch": 2.4652863398665024, "grad_norm": 4.39713759616983, "learning_rate": 2.6141438215945986e-06, "loss": 0.3835, "step": 7250 }, { "epoch": 2.4669869478338504, "grad_norm": 5.834908443228501, "learning_rate": 2.6114343858271444e-06, "loss": 0.3663, "step": 7255 }, { "epoch": 2.468687555801199, "grad_norm": 3.411864572013808, "learning_rate": 2.608724818905613e-06, "loss": 0.3713, "step": 7260 }, { "epoch": 2.4703881637685474, "grad_norm": 2.9180662262381007, "learning_rate": 2.606015124019061e-06, "loss": 0.3644, "step": 7265 }, { "epoch": 2.4720887717358955, "grad_norm": 4.7665976923772435, "learning_rate": 2.603305304356699e-06, "loss": 0.3739, "step": 7270 }, { "epoch": 2.473789379703244, "grad_norm": 4.417886027471119, "learning_rate": 2.600595363107881e-06, "loss": 0.4001, "step": 7275 }, { "epoch": 2.475489987670592, "grad_norm": 3.611893878891544, "learning_rate": 2.5978853034621068e-06, "loss": 0.3836, "step": 7280 }, { "epoch": 2.4771905956379405, "grad_norm": 7.797442710115348, "learning_rate": 2.5951751286090147e-06, "loss": 0.4007, "step": 7285 }, { "epoch": 2.478891203605289, "grad_norm": 5.48946205897742, "learning_rate": 2.5924648417383785e-06, "loss": 0.3738, "step": 7290 }, { "epoch": 2.480591811572637, "grad_norm": 8.268572046751316, "learning_rate": 2.5897544460401035e-06, "loss": 0.4021, "step": 7295 }, { "epoch": 2.4822924195399856, "grad_norm": 3.1373030798338823, "learning_rate": 2.5870439447042233e-06, "loss": 0.3594, "step": 7300 }, { "epoch": 2.483993027507334, "grad_norm": 6.809225971579622, "learning_rate": 2.5843333409208965e-06, "loss": 0.393, "step": 7305 }, { "epoch": 2.485693635474682, "grad_norm": 3.408878613042235, "learning_rate": 2.5816226378804016e-06, "loss": 0.3796, "step": 7310 }, { "epoch": 2.4873942434420306, "grad_norm": 3.403340689742314, "learning_rate": 2.578911838773134e-06, "loss": 0.3846, "step": 7315 }, { "epoch": 2.4890948514093787, "grad_norm": 8.252308876674388, "learning_rate": 2.5762009467896023e-06, "loss": 0.3864, "step": 7320 }, { "epoch": 2.490795459376727, "grad_norm": 8.842409358021643, "learning_rate": 2.573489965120424e-06, "loss": 0.3872, "step": 7325 }, { "epoch": 2.4924960673440757, "grad_norm": 3.189248487541075, "learning_rate": 2.570778896956322e-06, "loss": 0.3687, "step": 7330 }, { "epoch": 2.4941966753114237, "grad_norm": 4.395130713415741, "learning_rate": 2.5680677454881233e-06, "loss": 0.3742, "step": 7335 }, { "epoch": 2.4958972832787722, "grad_norm": 6.024759124178671, "learning_rate": 2.565356513906748e-06, "loss": 0.3626, "step": 7340 }, { "epoch": 2.4975978912461203, "grad_norm": 5.154437866323346, "learning_rate": 2.5626452054032176e-06, "loss": 0.3732, "step": 7345 }, { "epoch": 2.499298499213469, "grad_norm": 5.064941238616173, "learning_rate": 2.5599338231686377e-06, "loss": 0.382, "step": 7350 }, { "epoch": 2.5009991071808173, "grad_norm": 2.9330903967823887, "learning_rate": 2.5572223703942035e-06, "loss": 0.357, "step": 7355 }, { "epoch": 2.5026997151481654, "grad_norm": 4.296783574336214, "learning_rate": 2.554510850271193e-06, "loss": 0.3939, "step": 7360 }, { "epoch": 2.504400323115514, "grad_norm": 6.214571798228819, "learning_rate": 2.5517992659909634e-06, "loss": 0.382, "step": 7365 }, { "epoch": 2.5061009310828624, "grad_norm": 3.970546982199594, "learning_rate": 2.5490876207449475e-06, "loss": 0.4019, "step": 7370 }, { "epoch": 2.5078015390502104, "grad_norm": 39.025384810741045, "learning_rate": 2.5463759177246495e-06, "loss": 0.3722, "step": 7375 }, { "epoch": 2.509502147017559, "grad_norm": 3.188083488241429, "learning_rate": 2.5436641601216415e-06, "loss": 0.3802, "step": 7380 }, { "epoch": 2.511202754984907, "grad_norm": 3.14525105512372, "learning_rate": 2.5409523511275606e-06, "loss": 0.3773, "step": 7385 }, { "epoch": 2.5129033629522555, "grad_norm": 3.4363663027066664, "learning_rate": 2.5382404939341036e-06, "loss": 0.386, "step": 7390 }, { "epoch": 2.5146039709196035, "grad_norm": 3.2902728132282326, "learning_rate": 2.5355285917330246e-06, "loss": 0.3919, "step": 7395 }, { "epoch": 2.516304578886952, "grad_norm": 3.3955336441336237, "learning_rate": 2.53281664771613e-06, "loss": 0.3862, "step": 7400 }, { "epoch": 2.5180051868543005, "grad_norm": 30.81157843740032, "learning_rate": 2.5301046650752763e-06, "loss": 0.3748, "step": 7405 }, { "epoch": 2.5197057948216486, "grad_norm": 6.625981841806447, "learning_rate": 2.527392647002365e-06, "loss": 0.3738, "step": 7410 }, { "epoch": 2.521406402788997, "grad_norm": 3.0921998463324494, "learning_rate": 2.5246805966893388e-06, "loss": 0.3802, "step": 7415 }, { "epoch": 2.5231070107563456, "grad_norm": 5.253948797961146, "learning_rate": 2.5219685173281797e-06, "loss": 0.3834, "step": 7420 }, { "epoch": 2.5248076187236936, "grad_norm": 4.97384495475131, "learning_rate": 2.5192564121109025e-06, "loss": 0.3917, "step": 7425 }, { "epoch": 2.526508226691042, "grad_norm": 3.525846323462171, "learning_rate": 2.516544284229553e-06, "loss": 0.4042, "step": 7430 }, { "epoch": 2.5282088346583906, "grad_norm": 3.6262970555576683, "learning_rate": 2.5138321368762036e-06, "loss": 0.3846, "step": 7435 }, { "epoch": 2.5299094426257387, "grad_norm": 7.761359044261236, "learning_rate": 2.5111199732429497e-06, "loss": 0.3536, "step": 7440 }, { "epoch": 2.531610050593087, "grad_norm": 3.173733817605067, "learning_rate": 2.5084077965219056e-06, "loss": 0.3805, "step": 7445 }, { "epoch": 2.5333106585604352, "grad_norm": 4.144913659058304, "learning_rate": 2.5056956099052017e-06, "loss": 0.3615, "step": 7450 }, { "epoch": 2.5350112665277837, "grad_norm": 3.5670625717185755, "learning_rate": 2.5029834165849787e-06, "loss": 0.4021, "step": 7455 }, { "epoch": 2.536711874495132, "grad_norm": 3.7333967813566105, "learning_rate": 2.500271219753387e-06, "loss": 0.3697, "step": 7460 }, { "epoch": 2.5384124824624803, "grad_norm": 3.3655959893083276, "learning_rate": 2.49755902260258e-06, "loss": 0.4013, "step": 7465 }, { "epoch": 2.540113090429829, "grad_norm": 3.864579886438611, "learning_rate": 2.494846828324711e-06, "loss": 0.3678, "step": 7470 }, { "epoch": 2.541813698397177, "grad_norm": 3.570992827071064, "learning_rate": 2.4921346401119317e-06, "loss": 0.3591, "step": 7475 }, { "epoch": 2.5435143063645254, "grad_norm": 5.462664500161082, "learning_rate": 2.489422461156385e-06, "loss": 0.3782, "step": 7480 }, { "epoch": 2.545214914331874, "grad_norm": 3.398428072748691, "learning_rate": 2.4867102946502034e-06, "loss": 0.377, "step": 7485 }, { "epoch": 2.546915522299222, "grad_norm": 3.4400743234994033, "learning_rate": 2.4839981437855045e-06, "loss": 0.3901, "step": 7490 }, { "epoch": 2.5486161302665704, "grad_norm": 3.4595834831198524, "learning_rate": 2.4812860117543883e-06, "loss": 0.3861, "step": 7495 }, { "epoch": 2.5503167382339185, "grad_norm": 5.267665892342849, "learning_rate": 2.478573901748932e-06, "loss": 0.4066, "step": 7500 }, { "epoch": 2.552017346201267, "grad_norm": 5.184477130223167, "learning_rate": 2.475861816961187e-06, "loss": 0.4084, "step": 7505 }, { "epoch": 2.553717954168615, "grad_norm": 2.513941647767793, "learning_rate": 2.4731497605831747e-06, "loss": 0.3718, "step": 7510 }, { "epoch": 2.5554185621359635, "grad_norm": 3.5225126273980467, "learning_rate": 2.470437735806884e-06, "loss": 0.3827, "step": 7515 }, { "epoch": 2.557119170103312, "grad_norm": 4.293542179809013, "learning_rate": 2.4677257458242645e-06, "loss": 0.3697, "step": 7520 }, { "epoch": 2.55881977807066, "grad_norm": 5.69680532630941, "learning_rate": 2.4650137938272285e-06, "loss": 0.3802, "step": 7525 }, { "epoch": 2.5605203860380086, "grad_norm": 2.967764381460697, "learning_rate": 2.4623018830076405e-06, "loss": 0.373, "step": 7530 }, { "epoch": 2.562220994005357, "grad_norm": 7.904769990452013, "learning_rate": 2.459590016557317e-06, "loss": 0.347, "step": 7535 }, { "epoch": 2.563921601972705, "grad_norm": 2.6712999240040443, "learning_rate": 2.4568781976680233e-06, "loss": 0.3489, "step": 7540 }, { "epoch": 2.5656222099400536, "grad_norm": 4.817275571665213, "learning_rate": 2.4541664295314677e-06, "loss": 0.368, "step": 7545 }, { "epoch": 2.567322817907402, "grad_norm": 2.9038353180708527, "learning_rate": 2.4514547153392997e-06, "loss": 0.3886, "step": 7550 }, { "epoch": 2.56902342587475, "grad_norm": 3.755370708675757, "learning_rate": 2.4487430582831047e-06, "loss": 0.3554, "step": 7555 }, { "epoch": 2.5707240338420987, "grad_norm": 4.313990405515008, "learning_rate": 2.446031461554401e-06, "loss": 0.3781, "step": 7560 }, { "epoch": 2.5724246418094467, "grad_norm": 7.213752168555879, "learning_rate": 2.4433199283446355e-06, "loss": 0.3746, "step": 7565 }, { "epoch": 2.5741252497767952, "grad_norm": 4.781137209485712, "learning_rate": 2.4406084618451814e-06, "loss": 0.3818, "step": 7570 }, { "epoch": 2.5758258577441433, "grad_norm": 3.7032399148982402, "learning_rate": 2.4378970652473326e-06, "loss": 0.3767, "step": 7575 }, { "epoch": 2.577526465711492, "grad_norm": 3.3414048650087937, "learning_rate": 2.4351857417422997e-06, "loss": 0.3805, "step": 7580 }, { "epoch": 2.5792270736788403, "grad_norm": 3.0772607371490155, "learning_rate": 2.43247449452121e-06, "loss": 0.3903, "step": 7585 }, { "epoch": 2.5809276816461884, "grad_norm": 3.571353844353706, "learning_rate": 2.429763326775099e-06, "loss": 0.3954, "step": 7590 }, { "epoch": 2.582628289613537, "grad_norm": 4.5065287338944895, "learning_rate": 2.4270522416949087e-06, "loss": 0.3657, "step": 7595 }, { "epoch": 2.5843288975808854, "grad_norm": 8.471981991390132, "learning_rate": 2.4243412424714845e-06, "loss": 0.4025, "step": 7600 }, { "epoch": 2.5860295055482334, "grad_norm": 7.019370364912208, "learning_rate": 2.42163033229557e-06, "loss": 0.3711, "step": 7605 }, { "epoch": 2.587730113515582, "grad_norm": 2.6919262297176045, "learning_rate": 2.4189195143578055e-06, "loss": 0.3839, "step": 7610 }, { "epoch": 2.5894307214829304, "grad_norm": 4.008927772103962, "learning_rate": 2.4162087918487207e-06, "loss": 0.3821, "step": 7615 }, { "epoch": 2.5911313294502785, "grad_norm": 5.902137336977824, "learning_rate": 2.4134981679587342e-06, "loss": 0.3883, "step": 7620 }, { "epoch": 2.592831937417627, "grad_norm": 4.924077039933856, "learning_rate": 2.4107876458781485e-06, "loss": 0.3907, "step": 7625 }, { "epoch": 2.594532545384975, "grad_norm": 2.8942783036038042, "learning_rate": 2.4080772287971455e-06, "loss": 0.3619, "step": 7630 }, { "epoch": 2.5962331533523235, "grad_norm": 3.664197602241772, "learning_rate": 2.405366919905785e-06, "loss": 0.3676, "step": 7635 }, { "epoch": 2.5979337613196716, "grad_norm": 3.3608785711897693, "learning_rate": 2.4026567223939976e-06, "loss": 0.395, "step": 7640 }, { "epoch": 2.59963436928702, "grad_norm": 3.8505348636741195, "learning_rate": 2.3999466394515846e-06, "loss": 0.3761, "step": 7645 }, { "epoch": 2.6013349772543686, "grad_norm": 5.337704928704311, "learning_rate": 2.397236674268211e-06, "loss": 0.3904, "step": 7650 }, { "epoch": 2.6030355852217166, "grad_norm": 5.094246440451878, "learning_rate": 2.3945268300334047e-06, "loss": 0.3831, "step": 7655 }, { "epoch": 2.604736193189065, "grad_norm": 26.06405211120412, "learning_rate": 2.3918171099365493e-06, "loss": 0.3683, "step": 7660 }, { "epoch": 2.6064368011564136, "grad_norm": 4.381549185854137, "learning_rate": 2.389107517166884e-06, "loss": 0.3577, "step": 7665 }, { "epoch": 2.6081374091237617, "grad_norm": 5.131982881725748, "learning_rate": 2.386398054913497e-06, "loss": 0.4, "step": 7670 }, { "epoch": 2.60983801709111, "grad_norm": 3.852507064477302, "learning_rate": 2.3836887263653246e-06, "loss": 0.3688, "step": 7675 }, { "epoch": 2.6115386250584582, "grad_norm": 5.5768234648563215, "learning_rate": 2.380979534711143e-06, "loss": 0.3816, "step": 7680 }, { "epoch": 2.6132392330258067, "grad_norm": 6.229018627969279, "learning_rate": 2.3782704831395694e-06, "loss": 0.3757, "step": 7685 }, { "epoch": 2.614939840993155, "grad_norm": 4.587671023782299, "learning_rate": 2.3755615748390563e-06, "loss": 0.3922, "step": 7690 }, { "epoch": 2.6166404489605033, "grad_norm": 3.3821758897028174, "learning_rate": 2.372852812997886e-06, "loss": 0.3755, "step": 7695 }, { "epoch": 2.618341056927852, "grad_norm": 5.019978799382523, "learning_rate": 2.3701442008041682e-06, "loss": 0.3646, "step": 7700 }, { "epoch": 2.6200416648952, "grad_norm": 5.35780972776436, "learning_rate": 2.3674357414458395e-06, "loss": 0.3646, "step": 7705 }, { "epoch": 2.6217422728625484, "grad_norm": 8.517200690934615, "learning_rate": 2.364727438110654e-06, "loss": 0.3682, "step": 7710 }, { "epoch": 2.623442880829897, "grad_norm": 4.786010589800363, "learning_rate": 2.3620192939861827e-06, "loss": 0.3739, "step": 7715 }, { "epoch": 2.625143488797245, "grad_norm": 6.383939384051332, "learning_rate": 2.359311312259809e-06, "loss": 0.3655, "step": 7720 }, { "epoch": 2.6268440967645934, "grad_norm": 4.309724877465426, "learning_rate": 2.356603496118726e-06, "loss": 0.393, "step": 7725 }, { "epoch": 2.628544704731942, "grad_norm": 5.480641163818891, "learning_rate": 2.353895848749931e-06, "loss": 0.3788, "step": 7730 }, { "epoch": 2.63024531269929, "grad_norm": 4.328706468107978, "learning_rate": 2.351188373340223e-06, "loss": 0.377, "step": 7735 }, { "epoch": 2.6319459206666385, "grad_norm": 7.62919261883074, "learning_rate": 2.348481073076199e-06, "loss": 0.4051, "step": 7740 }, { "epoch": 2.6336465286339865, "grad_norm": 3.224620235847286, "learning_rate": 2.345773951144249e-06, "loss": 0.3943, "step": 7745 }, { "epoch": 2.635347136601335, "grad_norm": 4.335437896054658, "learning_rate": 2.343067010730554e-06, "loss": 0.3917, "step": 7750 }, { "epoch": 2.637047744568683, "grad_norm": 6.3436081714943935, "learning_rate": 2.340360255021081e-06, "loss": 0.3914, "step": 7755 }, { "epoch": 2.6387483525360316, "grad_norm": 3.5420470161572357, "learning_rate": 2.337653687201579e-06, "loss": 0.4012, "step": 7760 }, { "epoch": 2.64044896050338, "grad_norm": 5.199887613583664, "learning_rate": 2.3349473104575775e-06, "loss": 0.3701, "step": 7765 }, { "epoch": 2.642149568470728, "grad_norm": 6.659111837995282, "learning_rate": 2.3322411279743794e-06, "loss": 0.3658, "step": 7770 }, { "epoch": 2.6438501764380766, "grad_norm": 9.759159522946431, "learning_rate": 2.32953514293706e-06, "loss": 0.3669, "step": 7775 }, { "epoch": 2.645550784405425, "grad_norm": 8.286491160452785, "learning_rate": 2.3268293585304615e-06, "loss": 0.3769, "step": 7780 }, { "epoch": 2.647251392372773, "grad_norm": 2.889876656758775, "learning_rate": 2.32412377793919e-06, "loss": 0.4135, "step": 7785 }, { "epoch": 2.6489520003401217, "grad_norm": 3.5824133480839353, "learning_rate": 2.321418404347613e-06, "loss": 0.394, "step": 7790 }, { "epoch": 2.65065260830747, "grad_norm": 3.9983799200992594, "learning_rate": 2.318713240939853e-06, "loss": 0.3895, "step": 7795 }, { "epoch": 2.6523532162748182, "grad_norm": 6.824853929282826, "learning_rate": 2.316008290899785e-06, "loss": 0.3843, "step": 7800 }, { "epoch": 2.6540538242421667, "grad_norm": 3.310059658229797, "learning_rate": 2.3133035574110338e-06, "loss": 0.3845, "step": 7805 }, { "epoch": 2.655754432209515, "grad_norm": 11.233557190194349, "learning_rate": 2.310599043656969e-06, "loss": 0.3888, "step": 7810 }, { "epoch": 2.6574550401768633, "grad_norm": 6.894650044327275, "learning_rate": 2.3078947528207012e-06, "loss": 0.3815, "step": 7815 }, { "epoch": 2.6591556481442113, "grad_norm": 4.2025389632182515, "learning_rate": 2.3051906880850786e-06, "loss": 0.3697, "step": 7820 }, { "epoch": 2.66085625611156, "grad_norm": 5.769520512488445, "learning_rate": 2.3024868526326846e-06, "loss": 0.3762, "step": 7825 }, { "epoch": 2.6625568640789083, "grad_norm": 4.6035878269965425, "learning_rate": 2.299783249645832e-06, "loss": 0.3663, "step": 7830 }, { "epoch": 2.6642574720462564, "grad_norm": 3.1697397351640513, "learning_rate": 2.297079882306558e-06, "loss": 0.3667, "step": 7835 }, { "epoch": 2.665958080013605, "grad_norm": 4.3110879400985604, "learning_rate": 2.294376753796626e-06, "loss": 0.3558, "step": 7840 }, { "epoch": 2.6676586879809534, "grad_norm": 16.53847455813875, "learning_rate": 2.2916738672975154e-06, "loss": 0.3876, "step": 7845 }, { "epoch": 2.6693592959483015, "grad_norm": 3.59045893067525, "learning_rate": 2.2889712259904222e-06, "loss": 0.3901, "step": 7850 }, { "epoch": 2.67105990391565, "grad_norm": 4.1466790838667755, "learning_rate": 2.286268833056254e-06, "loss": 0.3908, "step": 7855 }, { "epoch": 2.672760511882998, "grad_norm": 4.030465815283501, "learning_rate": 2.283566691675625e-06, "loss": 0.3778, "step": 7860 }, { "epoch": 2.6744611198503465, "grad_norm": 4.993116520883105, "learning_rate": 2.2808648050288535e-06, "loss": 0.3882, "step": 7865 }, { "epoch": 2.6761617278176946, "grad_norm": 3.8372994898270005, "learning_rate": 2.2781631762959596e-06, "loss": 0.3817, "step": 7870 }, { "epoch": 2.677862335785043, "grad_norm": 3.519788857821788, "learning_rate": 2.2754618086566572e-06, "loss": 0.4041, "step": 7875 }, { "epoch": 2.6795629437523916, "grad_norm": 14.842927848150968, "learning_rate": 2.272760705290356e-06, "loss": 0.3651, "step": 7880 }, { "epoch": 2.6812635517197396, "grad_norm": 7.4190183694324965, "learning_rate": 2.270059869376151e-06, "loss": 0.3959, "step": 7885 }, { "epoch": 2.682964159687088, "grad_norm": 2.8103680082154043, "learning_rate": 2.267359304092826e-06, "loss": 0.3607, "step": 7890 }, { "epoch": 2.6846647676544366, "grad_norm": 7.937493753557344, "learning_rate": 2.264659012618845e-06, "loss": 0.385, "step": 7895 }, { "epoch": 2.6863653756217847, "grad_norm": 6.71945655343373, "learning_rate": 2.2619589981323483e-06, "loss": 0.392, "step": 7900 }, { "epoch": 2.688065983589133, "grad_norm": 6.118543164068601, "learning_rate": 2.259259263811151e-06, "loss": 0.3672, "step": 7905 }, { "epoch": 2.6897665915564817, "grad_norm": 6.386964430087839, "learning_rate": 2.2565598128327406e-06, "loss": 0.4052, "step": 7910 }, { "epoch": 2.6914671995238297, "grad_norm": 4.389010997821173, "learning_rate": 2.2538606483742676e-06, "loss": 0.4002, "step": 7915 }, { "epoch": 2.6931678074911782, "grad_norm": 6.583744813712634, "learning_rate": 2.2511617736125474e-06, "loss": 0.4012, "step": 7920 }, { "epoch": 2.6948684154585263, "grad_norm": 4.355606914465958, "learning_rate": 2.2484631917240545e-06, "loss": 0.3847, "step": 7925 }, { "epoch": 2.696569023425875, "grad_norm": 5.6245867527012035, "learning_rate": 2.245764905884918e-06, "loss": 0.3712, "step": 7930 }, { "epoch": 2.698269631393223, "grad_norm": 6.816235104622127, "learning_rate": 2.2430669192709185e-06, "loss": 0.3903, "step": 7935 }, { "epoch": 2.6999702393605713, "grad_norm": 3.7447476154787367, "learning_rate": 2.240369235057485e-06, "loss": 0.3696, "step": 7940 }, { "epoch": 2.70167084732792, "grad_norm": 3.505473103172748, "learning_rate": 2.2376718564196893e-06, "loss": 0.3748, "step": 7945 }, { "epoch": 2.703371455295268, "grad_norm": 8.182338734493161, "learning_rate": 2.2349747865322463e-06, "loss": 0.3671, "step": 7950 }, { "epoch": 2.7050720632626164, "grad_norm": 3.3738020778194224, "learning_rate": 2.232278028569504e-06, "loss": 0.3735, "step": 7955 }, { "epoch": 2.706772671229965, "grad_norm": 6.372010091411438, "learning_rate": 2.229581585705447e-06, "loss": 0.3805, "step": 7960 }, { "epoch": 2.708473279197313, "grad_norm": 3.721118322118046, "learning_rate": 2.2268854611136853e-06, "loss": 0.3487, "step": 7965 }, { "epoch": 2.7101738871646615, "grad_norm": 4.129352252876067, "learning_rate": 2.2241896579674563e-06, "loss": 0.3678, "step": 7970 }, { "epoch": 2.71187449513201, "grad_norm": 6.703226073088949, "learning_rate": 2.22149417943962e-06, "loss": 0.3751, "step": 7975 }, { "epoch": 2.713575103099358, "grad_norm": 7.087381797195783, "learning_rate": 2.2187990287026525e-06, "loss": 0.3662, "step": 7980 }, { "epoch": 2.7152757110667065, "grad_norm": 7.831322740182201, "learning_rate": 2.2161042089286444e-06, "loss": 0.4128, "step": 7985 }, { "epoch": 2.7169763190340546, "grad_norm": 23.023277527211253, "learning_rate": 2.2134097232892974e-06, "loss": 0.373, "step": 7990 }, { "epoch": 2.718676927001403, "grad_norm": 3.767570360293103, "learning_rate": 2.21071557495592e-06, "loss": 0.3876, "step": 7995 }, { "epoch": 2.720377534968751, "grad_norm": 3.786013061698436, "learning_rate": 2.208021767099423e-06, "loss": 0.3597, "step": 8000 }, { "epoch": 2.7220781429360996, "grad_norm": 4.453520170115316, "learning_rate": 2.2053283028903174e-06, "loss": 0.3893, "step": 8005 }, { "epoch": 2.723778750903448, "grad_norm": 3.5345683175464413, "learning_rate": 2.2026351854987084e-06, "loss": 0.3609, "step": 8010 }, { "epoch": 2.725479358870796, "grad_norm": 2.971594563933989, "learning_rate": 2.1999424180942945e-06, "loss": 0.3667, "step": 8015 }, { "epoch": 2.7271799668381447, "grad_norm": 11.62878333088526, "learning_rate": 2.1972500038463614e-06, "loss": 0.372, "step": 8020 }, { "epoch": 2.728880574805493, "grad_norm": 6.510889807932155, "learning_rate": 2.1945579459237787e-06, "loss": 0.3886, "step": 8025 }, { "epoch": 2.7305811827728412, "grad_norm": 32.701936557309374, "learning_rate": 2.1918662474949974e-06, "loss": 0.3719, "step": 8030 }, { "epoch": 2.7322817907401897, "grad_norm": 4.0058386725211, "learning_rate": 2.1891749117280463e-06, "loss": 0.3832, "step": 8035 }, { "epoch": 2.733982398707538, "grad_norm": 3.019026214432123, "learning_rate": 2.186483941790526e-06, "loss": 0.389, "step": 8040 }, { "epoch": 2.7356830066748863, "grad_norm": 3.770997398095723, "learning_rate": 2.183793340849606e-06, "loss": 0.3508, "step": 8045 }, { "epoch": 2.7373836146422343, "grad_norm": 4.222862912262694, "learning_rate": 2.181103112072023e-06, "loss": 0.3507, "step": 8050 }, { "epoch": 2.739084222609583, "grad_norm": 3.910203241840868, "learning_rate": 2.1784132586240746e-06, "loss": 0.3844, "step": 8055 }, { "epoch": 2.7407848305769313, "grad_norm": 21.535114374675835, "learning_rate": 2.1757237836716173e-06, "loss": 0.3707, "step": 8060 }, { "epoch": 2.7424854385442794, "grad_norm": 4.1731150684600555, "learning_rate": 2.1730346903800625e-06, "loss": 0.3872, "step": 8065 }, { "epoch": 2.744186046511628, "grad_norm": 82.33717574896272, "learning_rate": 2.170345981914371e-06, "loss": 0.3993, "step": 8070 }, { "epoch": 2.7458866544789764, "grad_norm": 6.524400159721665, "learning_rate": 2.167657661439051e-06, "loss": 0.3801, "step": 8075 }, { "epoch": 2.7475872624463245, "grad_norm": 4.9763136904935115, "learning_rate": 2.1649697321181555e-06, "loss": 0.3828, "step": 8080 }, { "epoch": 2.749287870413673, "grad_norm": 5.610444848264231, "learning_rate": 2.1622821971152762e-06, "loss": 0.3753, "step": 8085 }, { "epoch": 2.7509884783810215, "grad_norm": 3.6915191716116125, "learning_rate": 2.1595950595935393e-06, "loss": 0.3803, "step": 8090 }, { "epoch": 2.7526890863483695, "grad_norm": 6.5278947678428025, "learning_rate": 2.1569083227156064e-06, "loss": 0.3637, "step": 8095 }, { "epoch": 2.754389694315718, "grad_norm": 5.5775103216434205, "learning_rate": 2.1542219896436647e-06, "loss": 0.3746, "step": 8100 }, { "epoch": 2.756090302283066, "grad_norm": 4.85406609326352, "learning_rate": 2.151536063539427e-06, "loss": 0.3846, "step": 8105 }, { "epoch": 2.7577909102504146, "grad_norm": 8.979352424237328, "learning_rate": 2.148850547564128e-06, "loss": 0.3774, "step": 8110 }, { "epoch": 2.7594915182177626, "grad_norm": 4.647022479580419, "learning_rate": 2.146165444878518e-06, "loss": 0.3612, "step": 8115 }, { "epoch": 2.761192126185111, "grad_norm": 4.257524473576935, "learning_rate": 2.143480758642862e-06, "loss": 0.3877, "step": 8120 }, { "epoch": 2.7628927341524596, "grad_norm": 5.083975554617405, "learning_rate": 2.140796492016935e-06, "loss": 0.3972, "step": 8125 }, { "epoch": 2.7645933421198077, "grad_norm": 3.7558709013901317, "learning_rate": 2.1381126481600177e-06, "loss": 0.3704, "step": 8130 }, { "epoch": 2.766293950087156, "grad_norm": 6.936058030138578, "learning_rate": 2.1354292302308934e-06, "loss": 0.3993, "step": 8135 }, { "epoch": 2.7679945580545047, "grad_norm": 3.8309644397369635, "learning_rate": 2.1327462413878435e-06, "loss": 0.3586, "step": 8140 }, { "epoch": 2.7696951660218527, "grad_norm": 6.63698719991675, "learning_rate": 2.1300636847886454e-06, "loss": 0.3789, "step": 8145 }, { "epoch": 2.7713957739892012, "grad_norm": 3.6810387030854956, "learning_rate": 2.1273815635905665e-06, "loss": 0.3583, "step": 8150 }, { "epoch": 2.7730963819565497, "grad_norm": 5.340160023912234, "learning_rate": 2.124699880950364e-06, "loss": 0.3606, "step": 8155 }, { "epoch": 2.774796989923898, "grad_norm": 3.3374867768954144, "learning_rate": 2.122018640024276e-06, "loss": 0.3534, "step": 8160 }, { "epoch": 2.7764975978912463, "grad_norm": 23.05919247327434, "learning_rate": 2.119337843968023e-06, "loss": 0.366, "step": 8165 }, { "epoch": 2.7781982058585943, "grad_norm": 10.548598893835438, "learning_rate": 2.1166574959368007e-06, "loss": 0.3637, "step": 8170 }, { "epoch": 2.779898813825943, "grad_norm": 3.719602530650269, "learning_rate": 2.1139775990852777e-06, "loss": 0.376, "step": 8175 }, { "epoch": 2.781599421793291, "grad_norm": 3.9314066322384407, "learning_rate": 2.111298156567592e-06, "loss": 0.3735, "step": 8180 }, { "epoch": 2.7833000297606394, "grad_norm": 3.8470975640453413, "learning_rate": 2.1086191715373465e-06, "loss": 0.3667, "step": 8185 }, { "epoch": 2.785000637727988, "grad_norm": 13.062597481548549, "learning_rate": 2.105940647147606e-06, "loss": 0.3639, "step": 8190 }, { "epoch": 2.786701245695336, "grad_norm": 4.425463584564762, "learning_rate": 2.1032625865508927e-06, "loss": 0.3811, "step": 8195 }, { "epoch": 2.7884018536626844, "grad_norm": 8.416501818865054, "learning_rate": 2.1005849928991827e-06, "loss": 0.3811, "step": 8200 }, { "epoch": 2.790102461630033, "grad_norm": 8.558767887079613, "learning_rate": 2.0979078693439038e-06, "loss": 0.3711, "step": 8205 }, { "epoch": 2.791803069597381, "grad_norm": 3.4236488145310986, "learning_rate": 2.0952312190359287e-06, "loss": 0.3891, "step": 8210 }, { "epoch": 2.7935036775647295, "grad_norm": 5.493554420139218, "learning_rate": 2.0925550451255747e-06, "loss": 0.376, "step": 8215 }, { "epoch": 2.7952042855320776, "grad_norm": 3.313325831993448, "learning_rate": 2.089879350762598e-06, "loss": 0.3685, "step": 8220 }, { "epoch": 2.796904893499426, "grad_norm": 22.4153088648218, "learning_rate": 2.08720413909619e-06, "loss": 0.3602, "step": 8225 }, { "epoch": 2.798605501466774, "grad_norm": 8.98408239788447, "learning_rate": 2.0845294132749736e-06, "loss": 0.3681, "step": 8230 }, { "epoch": 2.8003061094341226, "grad_norm": 3.4347162415729353, "learning_rate": 2.081855176447001e-06, "loss": 0.3784, "step": 8235 }, { "epoch": 2.802006717401471, "grad_norm": 4.215328915462785, "learning_rate": 2.079181431759748e-06, "loss": 0.3803, "step": 8240 }, { "epoch": 2.803707325368819, "grad_norm": 119.95437737730414, "learning_rate": 2.076508182360111e-06, "loss": 0.3872, "step": 8245 }, { "epoch": 2.8054079333361677, "grad_norm": 3.9604876165994285, "learning_rate": 2.0738354313944055e-06, "loss": 0.3849, "step": 8250 }, { "epoch": 2.807108541303516, "grad_norm": 4.523244770758718, "learning_rate": 2.0711631820083575e-06, "loss": 0.3715, "step": 8255 }, { "epoch": 2.8088091492708642, "grad_norm": 3.6297302660625874, "learning_rate": 2.068491437347104e-06, "loss": 0.3842, "step": 8260 }, { "epoch": 2.8105097572382127, "grad_norm": 5.359805674286737, "learning_rate": 2.065820200555188e-06, "loss": 0.3387, "step": 8265 }, { "epoch": 2.8122103652055612, "grad_norm": 4.557516040202472, "learning_rate": 2.0631494747765546e-06, "loss": 0.3659, "step": 8270 }, { "epoch": 2.8139109731729093, "grad_norm": 6.7908173456117895, "learning_rate": 2.0604792631545482e-06, "loss": 0.3928, "step": 8275 }, { "epoch": 2.815611581140258, "grad_norm": 11.121940372885511, "learning_rate": 2.057809568831907e-06, "loss": 0.3543, "step": 8280 }, { "epoch": 2.817312189107606, "grad_norm": 30.478693939131286, "learning_rate": 2.0551403949507604e-06, "loss": 0.3676, "step": 8285 }, { "epoch": 2.8190127970749543, "grad_norm": 10.862270715950094, "learning_rate": 2.0524717446526264e-06, "loss": 0.3529, "step": 8290 }, { "epoch": 2.8207134050423024, "grad_norm": 9.092002991175214, "learning_rate": 2.049803621078405e-06, "loss": 0.3663, "step": 8295 }, { "epoch": 2.822414013009651, "grad_norm": 3.4487696947423574, "learning_rate": 2.047136027368378e-06, "loss": 0.3568, "step": 8300 }, { "epoch": 2.8241146209769994, "grad_norm": 3.5544329404182404, "learning_rate": 2.044468966662202e-06, "loss": 0.3785, "step": 8305 }, { "epoch": 2.8258152289443474, "grad_norm": 14.164049469477636, "learning_rate": 2.0418024420989075e-06, "loss": 0.3825, "step": 8310 }, { "epoch": 2.827515836911696, "grad_norm": 4.361080170557115, "learning_rate": 2.0391364568168936e-06, "loss": 0.3776, "step": 8315 }, { "epoch": 2.8292164448790444, "grad_norm": 7.4345207036596115, "learning_rate": 2.036471013953925e-06, "loss": 0.3928, "step": 8320 }, { "epoch": 2.8309170528463925, "grad_norm": 3.1225404452489536, "learning_rate": 2.033806116647127e-06, "loss": 0.3881, "step": 8325 }, { "epoch": 2.832617660813741, "grad_norm": 7.796796229295892, "learning_rate": 2.031141768032983e-06, "loss": 0.3757, "step": 8330 }, { "epoch": 2.8343182687810895, "grad_norm": 3.299102522694853, "learning_rate": 2.028477971247332e-06, "loss": 0.3402, "step": 8335 }, { "epoch": 2.8360188767484376, "grad_norm": 4.067819755694867, "learning_rate": 2.0258147294253627e-06, "loss": 0.3761, "step": 8340 }, { "epoch": 2.837719484715786, "grad_norm": 8.741701238488007, "learning_rate": 2.02315204570161e-06, "loss": 0.346, "step": 8345 }, { "epoch": 2.839420092683134, "grad_norm": 4.864531791571482, "learning_rate": 2.0204899232099527e-06, "loss": 0.3809, "step": 8350 }, { "epoch": 2.8411207006504826, "grad_norm": 4.759769443217969, "learning_rate": 2.017828365083608e-06, "loss": 0.3468, "step": 8355 }, { "epoch": 2.8428213086178307, "grad_norm": 5.282635242019972, "learning_rate": 2.0151673744551305e-06, "loss": 0.3933, "step": 8360 }, { "epoch": 2.844521916585179, "grad_norm": 6.999001062086554, "learning_rate": 2.0125069544564057e-06, "loss": 0.382, "step": 8365 }, { "epoch": 2.8462225245525277, "grad_norm": 4.122810194501148, "learning_rate": 2.009847108218648e-06, "loss": 0.3712, "step": 8370 }, { "epoch": 2.8479231325198757, "grad_norm": 3.8107728928149824, "learning_rate": 2.007187838872396e-06, "loss": 0.3765, "step": 8375 }, { "epoch": 2.849623740487224, "grad_norm": 10.353358184774276, "learning_rate": 2.00452914954751e-06, "loss": 0.3617, "step": 8380 }, { "epoch": 2.8513243484545727, "grad_norm": 7.173375709728723, "learning_rate": 2.0018710433731667e-06, "loss": 0.3511, "step": 8385 }, { "epoch": 2.8530249564219208, "grad_norm": 4.87187528806106, "learning_rate": 1.999213523477857e-06, "loss": 0.3847, "step": 8390 }, { "epoch": 2.8547255643892693, "grad_norm": 4.975178172526431, "learning_rate": 1.9965565929893825e-06, "loss": 0.3707, "step": 8395 }, { "epoch": 2.8564261723566173, "grad_norm": 10.3854682049754, "learning_rate": 1.9939002550348506e-06, "loss": 0.3702, "step": 8400 }, { "epoch": 2.858126780323966, "grad_norm": 20.460389393682824, "learning_rate": 1.99124451274067e-06, "loss": 0.3771, "step": 8405 }, { "epoch": 2.859827388291314, "grad_norm": 5.005293028199917, "learning_rate": 1.98858936923255e-06, "loss": 0.3575, "step": 8410 }, { "epoch": 2.8615279962586624, "grad_norm": 2.8275024075063144, "learning_rate": 1.985934827635495e-06, "loss": 0.3681, "step": 8415 }, { "epoch": 2.863228604226011, "grad_norm": 4.977576052712527, "learning_rate": 1.9832808910738e-06, "loss": 0.3991, "step": 8420 }, { "epoch": 2.864929212193359, "grad_norm": 3.3366440868337612, "learning_rate": 1.9806275626710483e-06, "loss": 0.3887, "step": 8425 }, { "epoch": 2.8666298201607074, "grad_norm": 31.395805092917065, "learning_rate": 1.977974845550108e-06, "loss": 0.395, "step": 8430 }, { "epoch": 2.868330428128056, "grad_norm": 2.9171081488360238, "learning_rate": 1.975322742833127e-06, "loss": 0.3638, "step": 8435 }, { "epoch": 2.870031036095404, "grad_norm": 7.071623054198009, "learning_rate": 1.972671257641531e-06, "loss": 0.3572, "step": 8440 }, { "epoch": 2.8717316440627525, "grad_norm": 4.116140867083445, "learning_rate": 1.970020393096017e-06, "loss": 0.3551, "step": 8445 }, { "epoch": 2.873432252030101, "grad_norm": 4.582359505283773, "learning_rate": 1.9673701523165537e-06, "loss": 0.3589, "step": 8450 }, { "epoch": 2.875132859997449, "grad_norm": 4.402633709070161, "learning_rate": 1.964720538422375e-06, "loss": 0.3824, "step": 8455 }, { "epoch": 2.8768334679647976, "grad_norm": 3.686748929733052, "learning_rate": 1.9620715545319763e-06, "loss": 0.386, "step": 8460 }, { "epoch": 2.8785340759321456, "grad_norm": 3.501304828830748, "learning_rate": 1.959423203763112e-06, "loss": 0.3814, "step": 8465 }, { "epoch": 2.880234683899494, "grad_norm": 4.273306250942062, "learning_rate": 1.9567754892327913e-06, "loss": 0.3611, "step": 8470 }, { "epoch": 2.881935291866842, "grad_norm": 4.377492367442627, "learning_rate": 1.9541284140572747e-06, "loss": 0.373, "step": 8475 }, { "epoch": 2.8836358998341907, "grad_norm": 3.4896546835096607, "learning_rate": 1.9514819813520697e-06, "loss": 0.3733, "step": 8480 }, { "epoch": 2.885336507801539, "grad_norm": 6.416417362163938, "learning_rate": 1.9488361942319283e-06, "loss": 0.3781, "step": 8485 }, { "epoch": 2.887037115768887, "grad_norm": 4.073770948021414, "learning_rate": 1.946191055810842e-06, "loss": 0.3742, "step": 8490 }, { "epoch": 2.8887377237362357, "grad_norm": 3.2692801404584086, "learning_rate": 1.94354656920204e-06, "loss": 0.38, "step": 8495 }, { "epoch": 2.890438331703584, "grad_norm": 3.868075079047741, "learning_rate": 1.9409027375179827e-06, "loss": 0.3727, "step": 8500 }, { "epoch": 2.8921389396709323, "grad_norm": 3.498489470199948, "learning_rate": 1.9382595638703603e-06, "loss": 0.3687, "step": 8505 }, { "epoch": 2.8938395476382808, "grad_norm": 4.898602514999291, "learning_rate": 1.935617051370089e-06, "loss": 0.3813, "step": 8510 }, { "epoch": 2.8955401556056293, "grad_norm": 3.7056569273722206, "learning_rate": 1.9329752031273073e-06, "loss": 0.3644, "step": 8515 }, { "epoch": 2.8972407635729773, "grad_norm": 3.270896760191762, "learning_rate": 1.93033402225137e-06, "loss": 0.3922, "step": 8520 }, { "epoch": 2.898941371540326, "grad_norm": 11.671782966595131, "learning_rate": 1.927693511850849e-06, "loss": 0.3801, "step": 8525 }, { "epoch": 2.900641979507674, "grad_norm": 7.352868576954751, "learning_rate": 1.925053675033524e-06, "loss": 0.3963, "step": 8530 }, { "epoch": 2.9023425874750224, "grad_norm": 14.006913439271788, "learning_rate": 1.9224145149063845e-06, "loss": 0.3601, "step": 8535 }, { "epoch": 2.9040431954423704, "grad_norm": 3.647756897066697, "learning_rate": 1.9197760345756227e-06, "loss": 0.3596, "step": 8540 }, { "epoch": 2.905743803409719, "grad_norm": 4.815506591854202, "learning_rate": 1.9171382371466302e-06, "loss": 0.3814, "step": 8545 }, { "epoch": 2.9074444113770674, "grad_norm": 6.5540382195623375, "learning_rate": 1.9145011257239957e-06, "loss": 0.3995, "step": 8550 }, { "epoch": 2.9091450193444155, "grad_norm": 4.2081097534769745, "learning_rate": 1.9118647034115e-06, "loss": 0.3768, "step": 8555 }, { "epoch": 2.910845627311764, "grad_norm": 3.1412509842846186, "learning_rate": 1.909228973312113e-06, "loss": 0.3569, "step": 8560 }, { "epoch": 2.9125462352791125, "grad_norm": 5.596007643345584, "learning_rate": 1.9065939385279892e-06, "loss": 0.371, "step": 8565 }, { "epoch": 2.9142468432464605, "grad_norm": 5.501455976838975, "learning_rate": 1.9039596021604654e-06, "loss": 0.3831, "step": 8570 }, { "epoch": 2.915947451213809, "grad_norm": 2.852898924760873, "learning_rate": 1.9013259673100577e-06, "loss": 0.3613, "step": 8575 }, { "epoch": 2.917648059181157, "grad_norm": 5.966472806636236, "learning_rate": 1.898693037076454e-06, "loss": 0.38, "step": 8580 }, { "epoch": 2.9193486671485056, "grad_norm": 5.9729388381891155, "learning_rate": 1.8960608145585143e-06, "loss": 0.3894, "step": 8585 }, { "epoch": 2.9210492751158537, "grad_norm": 6.982534331399855, "learning_rate": 1.8934293028542657e-06, "loss": 0.353, "step": 8590 }, { "epoch": 2.922749883083202, "grad_norm": 23.139564328892174, "learning_rate": 1.8907985050608984e-06, "loss": 0.3714, "step": 8595 }, { "epoch": 2.9244504910505507, "grad_norm": 5.93645905058757, "learning_rate": 1.8881684242747622e-06, "loss": 0.3774, "step": 8600 }, { "epoch": 2.9261510990178987, "grad_norm": 8.136608402056961, "learning_rate": 1.8855390635913634e-06, "loss": 0.3897, "step": 8605 }, { "epoch": 2.927851706985247, "grad_norm": 3.021573214294128, "learning_rate": 1.8829104261053602e-06, "loss": 0.3736, "step": 8610 }, { "epoch": 2.9295523149525957, "grad_norm": 4.933923209392697, "learning_rate": 1.8802825149105603e-06, "loss": 0.3653, "step": 8615 }, { "epoch": 2.9312529229199438, "grad_norm": 4.706545349659176, "learning_rate": 1.877655333099916e-06, "loss": 0.3516, "step": 8620 }, { "epoch": 2.9329535308872923, "grad_norm": 6.6339352896567165, "learning_rate": 1.8750288837655218e-06, "loss": 0.3747, "step": 8625 }, { "epoch": 2.9346541388546408, "grad_norm": 6.618639590750141, "learning_rate": 1.8724031699986089e-06, "loss": 0.3814, "step": 8630 }, { "epoch": 2.936354746821989, "grad_norm": 7.978336468342465, "learning_rate": 1.8697781948895446e-06, "loss": 0.3676, "step": 8635 }, { "epoch": 2.9380553547893373, "grad_norm": 4.414106043923808, "learning_rate": 1.8671539615278257e-06, "loss": 0.389, "step": 8640 }, { "epoch": 2.9397559627566854, "grad_norm": 3.3613193471561646, "learning_rate": 1.8645304730020752e-06, "loss": 0.3667, "step": 8645 }, { "epoch": 2.941456570724034, "grad_norm": 4.364863672568588, "learning_rate": 1.8619077324000414e-06, "loss": 0.3637, "step": 8650 }, { "epoch": 2.943157178691382, "grad_norm": 5.026429905846794, "learning_rate": 1.8592857428085909e-06, "loss": 0.3717, "step": 8655 }, { "epoch": 2.9448577866587304, "grad_norm": 8.479583243415247, "learning_rate": 1.8566645073137065e-06, "loss": 0.379, "step": 8660 }, { "epoch": 2.946558394626079, "grad_norm": 6.665050142493819, "learning_rate": 1.854044029000484e-06, "loss": 0.3963, "step": 8665 }, { "epoch": 2.948259002593427, "grad_norm": 3.48398056319533, "learning_rate": 1.8514243109531277e-06, "loss": 0.3789, "step": 8670 }, { "epoch": 2.9499596105607755, "grad_norm": 4.310777202329698, "learning_rate": 1.848805356254947e-06, "loss": 0.3705, "step": 8675 }, { "epoch": 2.951660218528124, "grad_norm": 4.9980406559805415, "learning_rate": 1.8461871679883531e-06, "loss": 0.3973, "step": 8680 }, { "epoch": 2.953360826495472, "grad_norm": 3.63822826338399, "learning_rate": 1.843569749234855e-06, "loss": 0.3765, "step": 8685 }, { "epoch": 2.9550614344628205, "grad_norm": 4.862423352223616, "learning_rate": 1.8409531030750563e-06, "loss": 0.3962, "step": 8690 }, { "epoch": 2.956762042430169, "grad_norm": 4.381906282945819, "learning_rate": 1.83833723258865e-06, "loss": 0.3688, "step": 8695 }, { "epoch": 2.958462650397517, "grad_norm": 4.069621307312408, "learning_rate": 1.835722140854419e-06, "loss": 0.3868, "step": 8700 }, { "epoch": 2.9601632583648656, "grad_norm": 4.377704750031197, "learning_rate": 1.833107830950227e-06, "loss": 0.3683, "step": 8705 }, { "epoch": 2.9618638663322137, "grad_norm": 4.293619625964835, "learning_rate": 1.8304943059530178e-06, "loss": 0.3584, "step": 8710 }, { "epoch": 2.963564474299562, "grad_norm": 6.002080159765941, "learning_rate": 1.827881568938813e-06, "loss": 0.3672, "step": 8715 }, { "epoch": 2.96526508226691, "grad_norm": 9.916525104628843, "learning_rate": 1.825269622982705e-06, "loss": 0.3741, "step": 8720 }, { "epoch": 2.9669656902342587, "grad_norm": 5.040093533159837, "learning_rate": 1.8226584711588557e-06, "loss": 0.3687, "step": 8725 }, { "epoch": 2.968666298201607, "grad_norm": 7.20763291595424, "learning_rate": 1.8200481165404932e-06, "loss": 0.3708, "step": 8730 }, { "epoch": 2.9703669061689553, "grad_norm": 9.340020113857413, "learning_rate": 1.8174385621999064e-06, "loss": 0.3835, "step": 8735 }, { "epoch": 2.9720675141363038, "grad_norm": 8.400152105913959, "learning_rate": 1.8148298112084425e-06, "loss": 0.3856, "step": 8740 }, { "epoch": 2.9737681221036523, "grad_norm": 3.43374814609035, "learning_rate": 1.8122218666365032e-06, "loss": 0.3865, "step": 8745 }, { "epoch": 2.9754687300710003, "grad_norm": 10.47961502658653, "learning_rate": 1.8096147315535409e-06, "loss": 0.3608, "step": 8750 }, { "epoch": 2.977169338038349, "grad_norm": 6.460121108404991, "learning_rate": 1.8070084090280554e-06, "loss": 0.3916, "step": 8755 }, { "epoch": 2.978869946005697, "grad_norm": 5.4173512022009, "learning_rate": 1.8044029021275905e-06, "loss": 0.381, "step": 8760 }, { "epoch": 2.9805705539730454, "grad_norm": 6.102582510431227, "learning_rate": 1.8017982139187303e-06, "loss": 0.3747, "step": 8765 }, { "epoch": 2.9822711619403934, "grad_norm": 3.955489197067311, "learning_rate": 1.7991943474670942e-06, "loss": 0.3737, "step": 8770 }, { "epoch": 2.983971769907742, "grad_norm": 9.085768480316174, "learning_rate": 1.7965913058373346e-06, "loss": 0.394, "step": 8775 }, { "epoch": 2.9856723778750904, "grad_norm": 10.373678822487804, "learning_rate": 1.7939890920931346e-06, "loss": 0.3818, "step": 8780 }, { "epoch": 2.9873729858424385, "grad_norm": 3.4127171232770994, "learning_rate": 1.7913877092972009e-06, "loss": 0.3655, "step": 8785 }, { "epoch": 2.989073593809787, "grad_norm": 4.316849860712377, "learning_rate": 1.7887871605112635e-06, "loss": 0.3831, "step": 8790 }, { "epoch": 2.9907742017771355, "grad_norm": 4.078438401282003, "learning_rate": 1.7861874487960707e-06, "loss": 0.3907, "step": 8795 }, { "epoch": 2.9924748097444835, "grad_norm": 4.4078567477042405, "learning_rate": 1.7835885772113846e-06, "loss": 0.3705, "step": 8800 }, { "epoch": 2.994175417711832, "grad_norm": 5.163696644043879, "learning_rate": 1.7809905488159799e-06, "loss": 0.3633, "step": 8805 }, { "epoch": 2.9958760256791805, "grad_norm": 3.1201895676547964, "learning_rate": 1.7783933666676378e-06, "loss": 0.3684, "step": 8810 }, { "epoch": 2.9975766336465286, "grad_norm": 4.596905642906636, "learning_rate": 1.775797033823144e-06, "loss": 0.3733, "step": 8815 }, { "epoch": 2.999277241613877, "grad_norm": 4.953548102346647, "learning_rate": 1.773201553338285e-06, "loss": 0.3861, "step": 8820 }, { "epoch": 3.0006802431869395, "grad_norm": 4.837213083684313, "learning_rate": 1.7706069282678436e-06, "loss": 0.3291, "step": 8825 }, { "epoch": 3.0023808511542875, "grad_norm": 4.289894082584498, "learning_rate": 1.7680131616655954e-06, "loss": 0.3361, "step": 8830 }, { "epoch": 3.004081459121636, "grad_norm": 3.67762501007464, "learning_rate": 1.7654202565843065e-06, "loss": 0.3633, "step": 8835 }, { "epoch": 3.005782067088984, "grad_norm": 8.297568543931774, "learning_rate": 1.762828216075728e-06, "loss": 0.3682, "step": 8840 }, { "epoch": 3.0074826750563326, "grad_norm": 53.25224778009304, "learning_rate": 1.7602370431905952e-06, "loss": 0.3527, "step": 8845 }, { "epoch": 3.009183283023681, "grad_norm": 8.804512663857691, "learning_rate": 1.7576467409786196e-06, "loss": 0.3799, "step": 8850 }, { "epoch": 3.010883890991029, "grad_norm": 7.103880302255103, "learning_rate": 1.7550573124884901e-06, "loss": 0.3329, "step": 8855 }, { "epoch": 3.0125844989583777, "grad_norm": 3.330752538820018, "learning_rate": 1.7524687607678666e-06, "loss": 0.345, "step": 8860 }, { "epoch": 3.014285106925726, "grad_norm": 3.869647579638839, "learning_rate": 1.749881088863377e-06, "loss": 0.3655, "step": 8865 }, { "epoch": 3.015985714893074, "grad_norm": 6.557253327984917, "learning_rate": 1.7472942998206137e-06, "loss": 0.3451, "step": 8870 }, { "epoch": 3.0176863228604227, "grad_norm": 4.054072341150636, "learning_rate": 1.74470839668413e-06, "loss": 0.3508, "step": 8875 }, { "epoch": 3.0193869308277708, "grad_norm": 6.761454233909646, "learning_rate": 1.7421233824974367e-06, "loss": 0.3523, "step": 8880 }, { "epoch": 3.0210875387951193, "grad_norm": 7.976619077262149, "learning_rate": 1.7395392603029984e-06, "loss": 0.3578, "step": 8885 }, { "epoch": 3.0227881467624678, "grad_norm": 3.1686734317850247, "learning_rate": 1.7369560331422292e-06, "loss": 0.3558, "step": 8890 }, { "epoch": 3.024488754729816, "grad_norm": 5.485257218129294, "learning_rate": 1.7343737040554908e-06, "loss": 0.3611, "step": 8895 }, { "epoch": 3.0261893626971643, "grad_norm": 3.962374694461126, "learning_rate": 1.7317922760820868e-06, "loss": 0.3563, "step": 8900 }, { "epoch": 3.0278899706645124, "grad_norm": 3.051993159505651, "learning_rate": 1.7292117522602608e-06, "loss": 0.3558, "step": 8905 }, { "epoch": 3.029590578631861, "grad_norm": 8.579964201999829, "learning_rate": 1.7266321356271929e-06, "loss": 0.36, "step": 8910 }, { "epoch": 3.0312911865992094, "grad_norm": 17.88966263136647, "learning_rate": 1.7240534292189937e-06, "loss": 0.371, "step": 8915 }, { "epoch": 3.0329917945665574, "grad_norm": 3.9542284226956825, "learning_rate": 1.7214756360707047e-06, "loss": 0.3697, "step": 8920 }, { "epoch": 3.034692402533906, "grad_norm": 3.5475828665981006, "learning_rate": 1.7188987592162907e-06, "loss": 0.3687, "step": 8925 }, { "epoch": 3.036393010501254, "grad_norm": 11.781307466979413, "learning_rate": 1.7163228016886388e-06, "loss": 0.3838, "step": 8930 }, { "epoch": 3.0380936184686025, "grad_norm": 6.924010342238102, "learning_rate": 1.7137477665195538e-06, "loss": 0.3548, "step": 8935 }, { "epoch": 3.039794226435951, "grad_norm": 2.5011090104936735, "learning_rate": 1.711173656739756e-06, "loss": 0.3519, "step": 8940 }, { "epoch": 3.041494834403299, "grad_norm": 5.426408581554422, "learning_rate": 1.7086004753788755e-06, "loss": 0.3363, "step": 8945 }, { "epoch": 3.0431954423706475, "grad_norm": 4.791366672605923, "learning_rate": 1.7060282254654497e-06, "loss": 0.3527, "step": 8950 }, { "epoch": 3.044896050337996, "grad_norm": 6.066732269091804, "learning_rate": 1.70345691002692e-06, "loss": 0.3684, "step": 8955 }, { "epoch": 3.046596658305344, "grad_norm": 4.22017798132994, "learning_rate": 1.7008865320896279e-06, "loss": 0.3665, "step": 8960 }, { "epoch": 3.0482972662726926, "grad_norm": 3.464794634492069, "learning_rate": 1.6983170946788114e-06, "loss": 0.3529, "step": 8965 }, { "epoch": 3.0499978742400407, "grad_norm": 3.9222375322099614, "learning_rate": 1.6957486008186019e-06, "loss": 0.3619, "step": 8970 }, { "epoch": 3.051698482207389, "grad_norm": 4.633984968672792, "learning_rate": 1.6931810535320194e-06, "loss": 0.3632, "step": 8975 }, { "epoch": 3.0533990901747377, "grad_norm": 6.470570887351088, "learning_rate": 1.690614455840971e-06, "loss": 0.3528, "step": 8980 }, { "epoch": 3.0550996981420857, "grad_norm": 4.116975735798495, "learning_rate": 1.6880488107662457e-06, "loss": 0.3654, "step": 8985 }, { "epoch": 3.056800306109434, "grad_norm": 5.905237632834165, "learning_rate": 1.6854841213275105e-06, "loss": 0.3465, "step": 8990 }, { "epoch": 3.0585009140767823, "grad_norm": 3.8943219832538514, "learning_rate": 1.6829203905433084e-06, "loss": 0.3335, "step": 8995 }, { "epoch": 3.0602015220441308, "grad_norm": 3.687716783594247, "learning_rate": 1.680357621431055e-06, "loss": 0.3576, "step": 9000 }, { "epoch": 3.0619021300114793, "grad_norm": 5.149070151852963, "learning_rate": 1.677795817007032e-06, "loss": 0.3671, "step": 9005 }, { "epoch": 3.0636027379788273, "grad_norm": 3.494475439567705, "learning_rate": 1.6752349802863877e-06, "loss": 0.3471, "step": 9010 }, { "epoch": 3.065303345946176, "grad_norm": 4.954298151400518, "learning_rate": 1.67267511428313e-06, "loss": 0.3656, "step": 9015 }, { "epoch": 3.067003953913524, "grad_norm": 28.806765576574186, "learning_rate": 1.6701162220101249e-06, "loss": 0.3686, "step": 9020 }, { "epoch": 3.0687045618808724, "grad_norm": 6.240541900027824, "learning_rate": 1.6675583064790923e-06, "loss": 0.3642, "step": 9025 }, { "epoch": 3.070405169848221, "grad_norm": 8.834333729157482, "learning_rate": 1.665001370700603e-06, "loss": 0.3629, "step": 9030 }, { "epoch": 3.072105777815569, "grad_norm": 5.592392981972255, "learning_rate": 1.6624454176840732e-06, "loss": 0.3448, "step": 9035 }, { "epoch": 3.0738063857829174, "grad_norm": 4.3555719604118615, "learning_rate": 1.6598904504377638e-06, "loss": 0.3514, "step": 9040 }, { "epoch": 3.075506993750266, "grad_norm": 7.081636040320824, "learning_rate": 1.6573364719687758e-06, "loss": 0.3624, "step": 9045 }, { "epoch": 3.077207601717614, "grad_norm": 2.918298030062138, "learning_rate": 1.6547834852830447e-06, "loss": 0.3239, "step": 9050 }, { "epoch": 3.0789082096849625, "grad_norm": 9.006194751909424, "learning_rate": 1.6522314933853395e-06, "loss": 0.3435, "step": 9055 }, { "epoch": 3.0806088176523105, "grad_norm": 5.497911481021905, "learning_rate": 1.6496804992792604e-06, "loss": 0.3404, "step": 9060 }, { "epoch": 3.082309425619659, "grad_norm": 3.4235907173639317, "learning_rate": 1.64713050596723e-06, "loss": 0.3373, "step": 9065 }, { "epoch": 3.0840100335870075, "grad_norm": 4.502392674589181, "learning_rate": 1.6445815164504947e-06, "loss": 0.3454, "step": 9070 }, { "epoch": 3.0857106415543556, "grad_norm": 5.548983937554486, "learning_rate": 1.6420335337291197e-06, "loss": 0.3647, "step": 9075 }, { "epoch": 3.087411249521704, "grad_norm": 4.505852938618983, "learning_rate": 1.6394865608019842e-06, "loss": 0.3529, "step": 9080 }, { "epoch": 3.089111857489052, "grad_norm": 3.760432711472048, "learning_rate": 1.6369406006667795e-06, "loss": 0.3572, "step": 9085 }, { "epoch": 3.0908124654564006, "grad_norm": 17.584180935267348, "learning_rate": 1.6343956563200053e-06, "loss": 0.3559, "step": 9090 }, { "epoch": 3.092513073423749, "grad_norm": 3.4865540887340956, "learning_rate": 1.6318517307569648e-06, "loss": 0.3491, "step": 9095 }, { "epoch": 3.094213681391097, "grad_norm": 9.388707399553276, "learning_rate": 1.6293088269717633e-06, "loss": 0.3304, "step": 9100 }, { "epoch": 3.0959142893584457, "grad_norm": 4.835587502782633, "learning_rate": 1.6267669479573023e-06, "loss": 0.3427, "step": 9105 }, { "epoch": 3.0976148973257938, "grad_norm": 6.611333595318636, "learning_rate": 1.6242260967052776e-06, "loss": 0.3757, "step": 9110 }, { "epoch": 3.0993155052931423, "grad_norm": 7.241838210451506, "learning_rate": 1.6216862762061753e-06, "loss": 0.3755, "step": 9115 }, { "epoch": 3.1010161132604908, "grad_norm": 7.020118083200031, "learning_rate": 1.6191474894492698e-06, "loss": 0.3425, "step": 9120 }, { "epoch": 3.102716721227839, "grad_norm": 25.608830701241615, "learning_rate": 1.6166097394226165e-06, "loss": 0.3485, "step": 9125 }, { "epoch": 3.1044173291951873, "grad_norm": 4.755580662343598, "learning_rate": 1.6140730291130518e-06, "loss": 0.3482, "step": 9130 }, { "epoch": 3.106117937162536, "grad_norm": 4.701139521392903, "learning_rate": 1.6115373615061886e-06, "loss": 0.3559, "step": 9135 }, { "epoch": 3.107818545129884, "grad_norm": 4.272936049205998, "learning_rate": 1.6090027395864122e-06, "loss": 0.3609, "step": 9140 }, { "epoch": 3.1095191530972324, "grad_norm": 3.775254133893892, "learning_rate": 1.606469166336877e-06, "loss": 0.3529, "step": 9145 }, { "epoch": 3.1112197610645804, "grad_norm": 4.470528534439289, "learning_rate": 1.603936644739503e-06, "loss": 0.349, "step": 9150 }, { "epoch": 3.112920369031929, "grad_norm": 4.541302046969266, "learning_rate": 1.6014051777749734e-06, "loss": 0.3489, "step": 9155 }, { "epoch": 3.1146209769992774, "grad_norm": 6.250817802123763, "learning_rate": 1.5988747684227296e-06, "loss": 0.3377, "step": 9160 }, { "epoch": 3.1163215849666255, "grad_norm": 3.6038813728861294, "learning_rate": 1.5963454196609673e-06, "loss": 0.346, "step": 9165 }, { "epoch": 3.118022192933974, "grad_norm": 6.335899443657108, "learning_rate": 1.593817134466636e-06, "loss": 0.3402, "step": 9170 }, { "epoch": 3.119722800901322, "grad_norm": 6.969567561661201, "learning_rate": 1.591289915815431e-06, "loss": 0.3756, "step": 9175 }, { "epoch": 3.1214234088686705, "grad_norm": 4.221959987706928, "learning_rate": 1.588763766681794e-06, "loss": 0.344, "step": 9180 }, { "epoch": 3.123124016836019, "grad_norm": 4.604883109011117, "learning_rate": 1.5862386900389081e-06, "loss": 0.3714, "step": 9185 }, { "epoch": 3.124824624803367, "grad_norm": 6.435163478598471, "learning_rate": 1.5837146888586929e-06, "loss": 0.3589, "step": 9190 }, { "epoch": 3.1265252327707156, "grad_norm": 4.786425219960417, "learning_rate": 1.581191766111803e-06, "loss": 0.3357, "step": 9195 }, { "epoch": 3.1282258407380636, "grad_norm": 23.43921705701697, "learning_rate": 1.5786699247676232e-06, "loss": 0.3623, "step": 9200 }, { "epoch": 3.129926448705412, "grad_norm": 7.913970059170245, "learning_rate": 1.5761491677942664e-06, "loss": 0.3452, "step": 9205 }, { "epoch": 3.1316270566727606, "grad_norm": 4.869166593219783, "learning_rate": 1.573629498158568e-06, "loss": 0.3327, "step": 9210 }, { "epoch": 3.1333276646401087, "grad_norm": 5.034791596752279, "learning_rate": 1.571110918826085e-06, "loss": 0.3744, "step": 9215 }, { "epoch": 3.135028272607457, "grad_norm": 6.145183450030516, "learning_rate": 1.5685934327610902e-06, "loss": 0.3625, "step": 9220 }, { "epoch": 3.1367288805748057, "grad_norm": 6.360077325313443, "learning_rate": 1.5660770429265696e-06, "loss": 0.3476, "step": 9225 }, { "epoch": 3.1384294885421538, "grad_norm": 5.018401042849134, "learning_rate": 1.5635617522842197e-06, "loss": 0.3413, "step": 9230 }, { "epoch": 3.1401300965095023, "grad_norm": 26.970733338180644, "learning_rate": 1.5610475637944428e-06, "loss": 0.3496, "step": 9235 }, { "epoch": 3.1418307044768503, "grad_norm": 3.169329874562929, "learning_rate": 1.5585344804163443e-06, "loss": 0.3591, "step": 9240 }, { "epoch": 3.143531312444199, "grad_norm": 5.80691730548006, "learning_rate": 1.5560225051077284e-06, "loss": 0.3648, "step": 9245 }, { "epoch": 3.1452319204115473, "grad_norm": 9.811921458054437, "learning_rate": 1.5535116408250962e-06, "loss": 0.3505, "step": 9250 }, { "epoch": 3.1469325283788954, "grad_norm": 4.216795739393517, "learning_rate": 1.5510018905236395e-06, "loss": 0.3748, "step": 9255 }, { "epoch": 3.148633136346244, "grad_norm": 8.489736019684033, "learning_rate": 1.5484932571572397e-06, "loss": 0.3417, "step": 9260 }, { "epoch": 3.150333744313592, "grad_norm": 2.752951299093961, "learning_rate": 1.5459857436784655e-06, "loss": 0.3281, "step": 9265 }, { "epoch": 3.1520343522809404, "grad_norm": 4.242891501647164, "learning_rate": 1.543479353038565e-06, "loss": 0.362, "step": 9270 }, { "epoch": 3.153734960248289, "grad_norm": 6.2034575198668955, "learning_rate": 1.5409740881874655e-06, "loss": 0.343, "step": 9275 }, { "epoch": 3.155435568215637, "grad_norm": 5.08187694241715, "learning_rate": 1.5384699520737694e-06, "loss": 0.3532, "step": 9280 }, { "epoch": 3.1571361761829855, "grad_norm": 4.3705969329428545, "learning_rate": 1.535966947644751e-06, "loss": 0.3694, "step": 9285 }, { "epoch": 3.158836784150334, "grad_norm": 8.073484197259168, "learning_rate": 1.5334650778463522e-06, "loss": 0.3432, "step": 9290 }, { "epoch": 3.160537392117682, "grad_norm": 2.939116319643779, "learning_rate": 1.5309643456231793e-06, "loss": 0.3537, "step": 9295 }, { "epoch": 3.1622380000850305, "grad_norm": 4.847115814511223, "learning_rate": 1.5284647539185003e-06, "loss": 0.3517, "step": 9300 }, { "epoch": 3.1639386080523786, "grad_norm": 4.707389413634161, "learning_rate": 1.5259663056742403e-06, "loss": 0.3503, "step": 9305 }, { "epoch": 3.165639216019727, "grad_norm": 5.03051762560974, "learning_rate": 1.5234690038309791e-06, "loss": 0.3575, "step": 9310 }, { "epoch": 3.1673398239870756, "grad_norm": 3.3528175997137977, "learning_rate": 1.520972851327947e-06, "loss": 0.3639, "step": 9315 }, { "epoch": 3.1690404319544236, "grad_norm": 4.350561925961647, "learning_rate": 1.518477851103021e-06, "loss": 0.3315, "step": 9320 }, { "epoch": 3.170741039921772, "grad_norm": 18.377674093948617, "learning_rate": 1.5159840060927234e-06, "loss": 0.3701, "step": 9325 }, { "epoch": 3.17244164788912, "grad_norm": 4.034533545384697, "learning_rate": 1.5134913192322153e-06, "loss": 0.3704, "step": 9330 }, { "epoch": 3.1741422558564687, "grad_norm": 5.015297989412574, "learning_rate": 1.5109997934552957e-06, "loss": 0.355, "step": 9335 }, { "epoch": 3.175842863823817, "grad_norm": 3.4254130478277465, "learning_rate": 1.508509431694396e-06, "loss": 0.3618, "step": 9340 }, { "epoch": 3.1775434717911653, "grad_norm": 7.713471497146769, "learning_rate": 1.506020236880579e-06, "loss": 0.3788, "step": 9345 }, { "epoch": 3.1792440797585138, "grad_norm": 4.7558887015027755, "learning_rate": 1.503532211943533e-06, "loss": 0.3519, "step": 9350 }, { "epoch": 3.180944687725862, "grad_norm": 5.434857137728759, "learning_rate": 1.5010453598115694e-06, "loss": 0.369, "step": 9355 }, { "epoch": 3.1826452956932103, "grad_norm": 5.727333889997782, "learning_rate": 1.49855968341162e-06, "loss": 0.3473, "step": 9360 }, { "epoch": 3.184345903660559, "grad_norm": 4.348652363906678, "learning_rate": 1.4960751856692323e-06, "loss": 0.3323, "step": 9365 }, { "epoch": 3.186046511627907, "grad_norm": 12.327728424966066, "learning_rate": 1.4935918695085667e-06, "loss": 0.3443, "step": 9370 }, { "epoch": 3.1877471195952554, "grad_norm": 3.4232141587420175, "learning_rate": 1.4911097378523926e-06, "loss": 0.3536, "step": 9375 }, { "epoch": 3.1894477275626034, "grad_norm": 7.428770846564405, "learning_rate": 1.4886287936220851e-06, "loss": 0.3467, "step": 9380 }, { "epoch": 3.191148335529952, "grad_norm": 6.069640666537968, "learning_rate": 1.4861490397376234e-06, "loss": 0.3441, "step": 9385 }, { "epoch": 3.1928489434973004, "grad_norm": 6.296746254965375, "learning_rate": 1.4836704791175835e-06, "loss": 0.3456, "step": 9390 }, { "epoch": 3.1945495514646485, "grad_norm": 3.763029207673051, "learning_rate": 1.4811931146791386e-06, "loss": 0.3437, "step": 9395 }, { "epoch": 3.196250159431997, "grad_norm": 3.93215412662426, "learning_rate": 1.4787169493380529e-06, "loss": 0.3565, "step": 9400 }, { "epoch": 3.1979507673993455, "grad_norm": 3.2221714589571095, "learning_rate": 1.4762419860086802e-06, "loss": 0.355, "step": 9405 }, { "epoch": 3.1996513753666935, "grad_norm": 5.083692562089596, "learning_rate": 1.4737682276039589e-06, "loss": 0.3467, "step": 9410 }, { "epoch": 3.201351983334042, "grad_norm": 7.054467328041976, "learning_rate": 1.4712956770354097e-06, "loss": 0.3569, "step": 9415 }, { "epoch": 3.20305259130139, "grad_norm": 3.813157998820209, "learning_rate": 1.4688243372131314e-06, "loss": 0.3686, "step": 9420 }, { "epoch": 3.2047531992687386, "grad_norm": 5.237275286277567, "learning_rate": 1.466354211045798e-06, "loss": 0.358, "step": 9425 }, { "epoch": 3.206453807236087, "grad_norm": 4.193675720923958, "learning_rate": 1.4638853014406554e-06, "loss": 0.3578, "step": 9430 }, { "epoch": 3.208154415203435, "grad_norm": 6.747402085669682, "learning_rate": 1.4614176113035166e-06, "loss": 0.3744, "step": 9435 }, { "epoch": 3.2098550231707836, "grad_norm": 4.575456821912026, "learning_rate": 1.458951143538761e-06, "loss": 0.3575, "step": 9440 }, { "epoch": 3.2115556311381317, "grad_norm": 6.358850914941462, "learning_rate": 1.4564859010493265e-06, "loss": 0.3799, "step": 9445 }, { "epoch": 3.21325623910548, "grad_norm": 3.402734864262026, "learning_rate": 1.4540218867367134e-06, "loss": 0.3557, "step": 9450 }, { "epoch": 3.2149568470728287, "grad_norm": 5.46462070508825, "learning_rate": 1.4515591035009713e-06, "loss": 0.3759, "step": 9455 }, { "epoch": 3.2166574550401767, "grad_norm": 5.384845485189256, "learning_rate": 1.4490975542407054e-06, "loss": 0.3587, "step": 9460 }, { "epoch": 3.2183580630075252, "grad_norm": 11.201007213966772, "learning_rate": 1.4466372418530644e-06, "loss": 0.351, "step": 9465 }, { "epoch": 3.2200586709748737, "grad_norm": 4.594061434244031, "learning_rate": 1.4441781692337449e-06, "loss": 0.3614, "step": 9470 }, { "epoch": 3.221759278942222, "grad_norm": 3.8160947458124452, "learning_rate": 1.441720339276983e-06, "loss": 0.3412, "step": 9475 }, { "epoch": 3.2234598869095703, "grad_norm": 3.6042225339848706, "learning_rate": 1.4392637548755508e-06, "loss": 0.3687, "step": 9480 }, { "epoch": 3.2251604948769184, "grad_norm": 7.343903966138617, "learning_rate": 1.4368084189207576e-06, "loss": 0.3617, "step": 9485 }, { "epoch": 3.226861102844267, "grad_norm": 5.543673064140564, "learning_rate": 1.4343543343024388e-06, "loss": 0.3608, "step": 9490 }, { "epoch": 3.2285617108116154, "grad_norm": 7.081596418583588, "learning_rate": 1.4319015039089623e-06, "loss": 0.3261, "step": 9495 }, { "epoch": 3.2302623187789634, "grad_norm": 4.026350638570551, "learning_rate": 1.4294499306272147e-06, "loss": 0.3636, "step": 9500 }, { "epoch": 3.231962926746312, "grad_norm": 4.6889529787433135, "learning_rate": 1.4269996173426081e-06, "loss": 0.3603, "step": 9505 }, { "epoch": 3.23366353471366, "grad_norm": 10.962216911027449, "learning_rate": 1.4245505669390664e-06, "loss": 0.3586, "step": 9510 }, { "epoch": 3.2353641426810085, "grad_norm": 4.408765691773431, "learning_rate": 1.422102782299032e-06, "loss": 0.3623, "step": 9515 }, { "epoch": 3.237064750648357, "grad_norm": 12.213129830596777, "learning_rate": 1.419656266303453e-06, "loss": 0.3354, "step": 9520 }, { "epoch": 3.238765358615705, "grad_norm": 11.184768597148008, "learning_rate": 1.4172110218317891e-06, "loss": 0.3478, "step": 9525 }, { "epoch": 3.2404659665830535, "grad_norm": 11.388174070470491, "learning_rate": 1.4147670517619989e-06, "loss": 0.3602, "step": 9530 }, { "epoch": 3.2421665745504016, "grad_norm": 3.659338530912207, "learning_rate": 1.4123243589705438e-06, "loss": 0.3482, "step": 9535 }, { "epoch": 3.24386718251775, "grad_norm": 3.228756400523898, "learning_rate": 1.4098829463323827e-06, "loss": 0.3728, "step": 9540 }, { "epoch": 3.2455677904850986, "grad_norm": 4.017379303108377, "learning_rate": 1.4074428167209641e-06, "loss": 0.3634, "step": 9545 }, { "epoch": 3.2472683984524466, "grad_norm": 20.13883721091286, "learning_rate": 1.405003973008231e-06, "loss": 0.3712, "step": 9550 }, { "epoch": 3.248969006419795, "grad_norm": 7.343815939433381, "learning_rate": 1.4025664180646088e-06, "loss": 0.3542, "step": 9555 }, { "epoch": 3.250669614387143, "grad_norm": 3.139097530497291, "learning_rate": 1.4001301547590096e-06, "loss": 0.3487, "step": 9560 }, { "epoch": 3.2523702223544917, "grad_norm": 8.051406942129448, "learning_rate": 1.3976951859588214e-06, "loss": 0.3451, "step": 9565 }, { "epoch": 3.25407083032184, "grad_norm": 4.9757182624069, "learning_rate": 1.395261514529913e-06, "loss": 0.3605, "step": 9570 }, { "epoch": 3.2557714382891882, "grad_norm": 8.39339333496793, "learning_rate": 1.3928291433366225e-06, "loss": 0.3593, "step": 9575 }, { "epoch": 3.2574720462565367, "grad_norm": 4.108719744608423, "learning_rate": 1.3903980752417612e-06, "loss": 0.3517, "step": 9580 }, { "epoch": 3.2591726542238852, "grad_norm": 4.483900114111096, "learning_rate": 1.387968313106602e-06, "loss": 0.3529, "step": 9585 }, { "epoch": 3.2608732621912333, "grad_norm": 6.602313068657708, "learning_rate": 1.3855398597908865e-06, "loss": 0.3715, "step": 9590 }, { "epoch": 3.262573870158582, "grad_norm": 11.186997491144231, "learning_rate": 1.3831127181528097e-06, "loss": 0.3609, "step": 9595 }, { "epoch": 3.26427447812593, "grad_norm": 3.702255195437605, "learning_rate": 1.380686891049028e-06, "loss": 0.34, "step": 9600 }, { "epoch": 3.2659750860932784, "grad_norm": 3.111270454859789, "learning_rate": 1.378262381334649e-06, "loss": 0.3577, "step": 9605 }, { "epoch": 3.267675694060627, "grad_norm": 3.9015262777944586, "learning_rate": 1.3758391918632274e-06, "loss": 0.366, "step": 9610 }, { "epoch": 3.269376302027975, "grad_norm": 3.2201790060582733, "learning_rate": 1.3734173254867686e-06, "loss": 0.3388, "step": 9615 }, { "epoch": 3.2710769099953234, "grad_norm": 5.846711105583726, "learning_rate": 1.3709967850557155e-06, "loss": 0.3732, "step": 9620 }, { "epoch": 3.2727775179626715, "grad_norm": 5.302735003789467, "learning_rate": 1.3685775734189554e-06, "loss": 0.3364, "step": 9625 }, { "epoch": 3.27447812593002, "grad_norm": 9.045329764476312, "learning_rate": 1.3661596934238076e-06, "loss": 0.3507, "step": 9630 }, { "epoch": 3.2761787338973685, "grad_norm": 7.541951413082083, "learning_rate": 1.3637431479160268e-06, "loss": 0.355, "step": 9635 }, { "epoch": 3.2778793418647165, "grad_norm": 5.455160556892848, "learning_rate": 1.3613279397397954e-06, "loss": 0.3548, "step": 9640 }, { "epoch": 3.279579949832065, "grad_norm": 5.56820279064728, "learning_rate": 1.358914071737724e-06, "loss": 0.362, "step": 9645 }, { "epoch": 3.2812805577994135, "grad_norm": 6.458486233565775, "learning_rate": 1.356501546750842e-06, "loss": 0.3503, "step": 9650 }, { "epoch": 3.2829811657667616, "grad_norm": 16.080185721494754, "learning_rate": 1.3540903676186021e-06, "loss": 0.3605, "step": 9655 }, { "epoch": 3.28468177373411, "grad_norm": 3.379459157677767, "learning_rate": 1.3516805371788721e-06, "loss": 0.3438, "step": 9660 }, { "epoch": 3.286382381701458, "grad_norm": 4.213221873676639, "learning_rate": 1.3492720582679297e-06, "loss": 0.3657, "step": 9665 }, { "epoch": 3.2880829896688066, "grad_norm": 5.550378842694407, "learning_rate": 1.3468649337204665e-06, "loss": 0.3413, "step": 9670 }, { "epoch": 3.2897835976361547, "grad_norm": 4.188266219864308, "learning_rate": 1.3444591663695743e-06, "loss": 0.3623, "step": 9675 }, { "epoch": 3.291484205603503, "grad_norm": 6.504902750656069, "learning_rate": 1.3420547590467538e-06, "loss": 0.3505, "step": 9680 }, { "epoch": 3.2931848135708517, "grad_norm": 11.903424870560336, "learning_rate": 1.3396517145818996e-06, "loss": 0.3404, "step": 9685 }, { "epoch": 3.2948854215381997, "grad_norm": 4.97108654701699, "learning_rate": 1.3372500358033064e-06, "loss": 0.3527, "step": 9690 }, { "epoch": 3.2965860295055482, "grad_norm": 4.756803819783444, "learning_rate": 1.334849725537658e-06, "loss": 0.3654, "step": 9695 }, { "epoch": 3.2982866374728967, "grad_norm": 3.5908380327207152, "learning_rate": 1.3324507866100312e-06, "loss": 0.3286, "step": 9700 }, { "epoch": 3.299987245440245, "grad_norm": 4.592058291952547, "learning_rate": 1.3300532218438848e-06, "loss": 0.3373, "step": 9705 }, { "epoch": 3.3016878534075933, "grad_norm": 3.1249667027320087, "learning_rate": 1.3276570340610639e-06, "loss": 0.353, "step": 9710 }, { "epoch": 3.3033884613749414, "grad_norm": 10.738793625412239, "learning_rate": 1.32526222608179e-06, "loss": 0.3331, "step": 9715 }, { "epoch": 3.30508906934229, "grad_norm": 4.611524702720749, "learning_rate": 1.3228688007246627e-06, "loss": 0.3527, "step": 9720 }, { "epoch": 3.3067896773096384, "grad_norm": 7.857897100716977, "learning_rate": 1.3204767608066543e-06, "loss": 0.366, "step": 9725 }, { "epoch": 3.3084902852769864, "grad_norm": 4.918137208400486, "learning_rate": 1.3180861091431041e-06, "loss": 0.3543, "step": 9730 }, { "epoch": 3.310190893244335, "grad_norm": 6.304405555203345, "learning_rate": 1.3156968485477207e-06, "loss": 0.3586, "step": 9735 }, { "epoch": 3.311891501211683, "grad_norm": 5.407731517296218, "learning_rate": 1.3133089818325725e-06, "loss": 0.3224, "step": 9740 }, { "epoch": 3.3135921091790315, "grad_norm": 4.254600301547921, "learning_rate": 1.3109225118080904e-06, "loss": 0.375, "step": 9745 }, { "epoch": 3.31529271714638, "grad_norm": 5.3879622240552445, "learning_rate": 1.308537441283058e-06, "loss": 0.3409, "step": 9750 }, { "epoch": 3.316993325113728, "grad_norm": 5.503722668980672, "learning_rate": 1.306153773064615e-06, "loss": 0.3644, "step": 9755 }, { "epoch": 3.3186939330810765, "grad_norm": 3.7292060947964063, "learning_rate": 1.3037715099582477e-06, "loss": 0.3719, "step": 9760 }, { "epoch": 3.320394541048425, "grad_norm": 4.9213167491874135, "learning_rate": 1.3013906547677923e-06, "loss": 0.3705, "step": 9765 }, { "epoch": 3.322095149015773, "grad_norm": 3.145540184588686, "learning_rate": 1.299011210295423e-06, "loss": 0.3474, "step": 9770 }, { "epoch": 3.3237957569831216, "grad_norm": 4.681201076613843, "learning_rate": 1.2966331793416581e-06, "loss": 0.3492, "step": 9775 }, { "epoch": 3.3254963649504696, "grad_norm": 15.673425550163644, "learning_rate": 1.2942565647053513e-06, "loss": 0.3586, "step": 9780 }, { "epoch": 3.327196972917818, "grad_norm": 3.5449959702708687, "learning_rate": 1.2918813691836862e-06, "loss": 0.3597, "step": 9785 }, { "epoch": 3.3288975808851666, "grad_norm": 5.776623181868395, "learning_rate": 1.2895075955721812e-06, "loss": 0.3443, "step": 9790 }, { "epoch": 3.3305981888525147, "grad_norm": 5.284604667205001, "learning_rate": 1.2871352466646762e-06, "loss": 0.3454, "step": 9795 }, { "epoch": 3.332298796819863, "grad_norm": 2.9524565603519894, "learning_rate": 1.284764325253338e-06, "loss": 0.3324, "step": 9800 }, { "epoch": 3.3339994047872112, "grad_norm": 5.483482066166741, "learning_rate": 1.282394834128651e-06, "loss": 0.3702, "step": 9805 }, { "epoch": 3.3357000127545597, "grad_norm": 3.6052555779675934, "learning_rate": 1.2800267760794182e-06, "loss": 0.3557, "step": 9810 }, { "epoch": 3.3374006207219082, "grad_norm": 3.95121512021213, "learning_rate": 1.2776601538927533e-06, "loss": 0.3679, "step": 9815 }, { "epoch": 3.3391012286892563, "grad_norm": 3.5054855290377427, "learning_rate": 1.2752949703540831e-06, "loss": 0.3335, "step": 9820 }, { "epoch": 3.340801836656605, "grad_norm": 3.3110519465444477, "learning_rate": 1.2729312282471379e-06, "loss": 0.3564, "step": 9825 }, { "epoch": 3.3425024446239533, "grad_norm": 5.756438579662786, "learning_rate": 1.2705689303539553e-06, "loss": 0.3464, "step": 9830 }, { "epoch": 3.3442030525913014, "grad_norm": 5.266044233708819, "learning_rate": 1.2682080794548687e-06, "loss": 0.3399, "step": 9835 }, { "epoch": 3.34590366055865, "grad_norm": 4.226321802166191, "learning_rate": 1.2658486783285118e-06, "loss": 0.3524, "step": 9840 }, { "epoch": 3.347604268525998, "grad_norm": 3.425918661818654, "learning_rate": 1.2634907297518122e-06, "loss": 0.3493, "step": 9845 }, { "epoch": 3.3493048764933464, "grad_norm": 33.576944515185474, "learning_rate": 1.2611342364999843e-06, "loss": 0.3627, "step": 9850 }, { "epoch": 3.3510054844606945, "grad_norm": 14.704585043928068, "learning_rate": 1.258779201346534e-06, "loss": 0.3816, "step": 9855 }, { "epoch": 3.352706092428043, "grad_norm": 6.061828813618805, "learning_rate": 1.2564256270632474e-06, "loss": 0.3325, "step": 9860 }, { "epoch": 3.3544067003953915, "grad_norm": 5.616879925172779, "learning_rate": 1.2540735164201945e-06, "loss": 0.3661, "step": 9865 }, { "epoch": 3.3561073083627395, "grad_norm": 3.4960002907362444, "learning_rate": 1.2517228721857194e-06, "loss": 0.3524, "step": 9870 }, { "epoch": 3.357807916330088, "grad_norm": 6.37061786854243, "learning_rate": 1.249373697126443e-06, "loss": 0.3616, "step": 9875 }, { "epoch": 3.3595085242974365, "grad_norm": 14.589626654492173, "learning_rate": 1.2470259940072552e-06, "loss": 0.3476, "step": 9880 }, { "epoch": 3.3612091322647846, "grad_norm": 5.493479668859717, "learning_rate": 1.244679765591315e-06, "loss": 0.3585, "step": 9885 }, { "epoch": 3.362909740232133, "grad_norm": 5.210868639805221, "learning_rate": 1.2423350146400436e-06, "loss": 0.3569, "step": 9890 }, { "epoch": 3.364610348199481, "grad_norm": 5.493687640765632, "learning_rate": 1.2399917439131257e-06, "loss": 0.352, "step": 9895 }, { "epoch": 3.3663109561668296, "grad_norm": 4.33835279527087, "learning_rate": 1.2376499561685034e-06, "loss": 0.35, "step": 9900 }, { "epoch": 3.368011564134178, "grad_norm": 5.224804654043163, "learning_rate": 1.235309654162371e-06, "loss": 0.3589, "step": 9905 }, { "epoch": 3.369712172101526, "grad_norm": 3.573205850458747, "learning_rate": 1.2329708406491775e-06, "loss": 0.345, "step": 9910 }, { "epoch": 3.3714127800688747, "grad_norm": 4.358838772350418, "learning_rate": 1.230633518381617e-06, "loss": 0.3485, "step": 9915 }, { "epoch": 3.3731133880362227, "grad_norm": 4.149921011912954, "learning_rate": 1.2282976901106314e-06, "loss": 0.353, "step": 9920 }, { "epoch": 3.3748139960035712, "grad_norm": 4.861952397773383, "learning_rate": 1.2259633585854006e-06, "loss": 0.3715, "step": 9925 }, { "epoch": 3.3765146039709197, "grad_norm": 4.230447350254913, "learning_rate": 1.2236305265533472e-06, "loss": 0.3388, "step": 9930 }, { "epoch": 3.378215211938268, "grad_norm": 3.869994835859092, "learning_rate": 1.2212991967601245e-06, "loss": 0.371, "step": 9935 }, { "epoch": 3.3799158199056163, "grad_norm": 5.2455469496553775, "learning_rate": 1.218969371949622e-06, "loss": 0.3621, "step": 9940 }, { "epoch": 3.381616427872965, "grad_norm": 8.880842044601593, "learning_rate": 1.216641054863954e-06, "loss": 0.34, "step": 9945 }, { "epoch": 3.383317035840313, "grad_norm": 4.18076871396702, "learning_rate": 1.214314248243464e-06, "loss": 0.3585, "step": 9950 }, { "epoch": 3.3850176438076613, "grad_norm": 4.122412595315365, "learning_rate": 1.2119889548267136e-06, "loss": 0.3672, "step": 9955 }, { "epoch": 3.3867182517750094, "grad_norm": 4.058013648340633, "learning_rate": 1.2096651773504866e-06, "loss": 0.3467, "step": 9960 }, { "epoch": 3.388418859742358, "grad_norm": 3.5657280622355563, "learning_rate": 1.2073429185497832e-06, "loss": 0.3624, "step": 9965 }, { "epoch": 3.3901194677097064, "grad_norm": 3.001269457589452, "learning_rate": 1.205022181157812e-06, "loss": 0.3391, "step": 9970 }, { "epoch": 3.3918200756770545, "grad_norm": 11.074556669455953, "learning_rate": 1.2027029679059958e-06, "loss": 0.3506, "step": 9975 }, { "epoch": 3.393520683644403, "grad_norm": 4.002246206732605, "learning_rate": 1.2003852815239592e-06, "loss": 0.3511, "step": 9980 }, { "epoch": 3.395221291611751, "grad_norm": 7.74038371051314, "learning_rate": 1.1980691247395341e-06, "loss": 0.3362, "step": 9985 }, { "epoch": 3.3969218995790995, "grad_norm": 6.430265373445128, "learning_rate": 1.1957545002787475e-06, "loss": 0.3665, "step": 9990 }, { "epoch": 3.398622507546448, "grad_norm": 5.8787842931823855, "learning_rate": 1.1934414108658273e-06, "loss": 0.3347, "step": 9995 }, { "epoch": 3.400323115513796, "grad_norm": 4.069184550986305, "learning_rate": 1.1911298592231912e-06, "loss": 0.3519, "step": 10000 }, { "epoch": 3.4020237234811446, "grad_norm": 6.865449984895776, "learning_rate": 1.18881984807145e-06, "loss": 0.3487, "step": 10005 }, { "epoch": 3.403724331448493, "grad_norm": 7.765969170122917, "learning_rate": 1.1865113801293978e-06, "loss": 0.3181, "step": 10010 }, { "epoch": 3.405424939415841, "grad_norm": 5.891701530959368, "learning_rate": 1.184204458114016e-06, "loss": 0.3313, "step": 10015 }, { "epoch": 3.4071255473831896, "grad_norm": 4.097355014099184, "learning_rate": 1.1818990847404657e-06, "loss": 0.3298, "step": 10020 }, { "epoch": 3.4088261553505377, "grad_norm": 3.3676007474296843, "learning_rate": 1.1795952627220825e-06, "loss": 0.3368, "step": 10025 }, { "epoch": 3.410526763317886, "grad_norm": 8.050189484594268, "learning_rate": 1.1772929947703802e-06, "loss": 0.3581, "step": 10030 }, { "epoch": 3.4122273712852342, "grad_norm": 12.988329727387205, "learning_rate": 1.1749922835950398e-06, "loss": 0.3351, "step": 10035 }, { "epoch": 3.4139279792525827, "grad_norm": 4.452551336342145, "learning_rate": 1.1726931319039133e-06, "loss": 0.3596, "step": 10040 }, { "epoch": 3.4156285872199312, "grad_norm": 4.7390840314669775, "learning_rate": 1.1703955424030142e-06, "loss": 0.3524, "step": 10045 }, { "epoch": 3.4173291951872793, "grad_norm": 3.853577268844155, "learning_rate": 1.1680995177965205e-06, "loss": 0.3579, "step": 10050 }, { "epoch": 3.419029803154628, "grad_norm": 11.34337877012613, "learning_rate": 1.165805060786765e-06, "loss": 0.3589, "step": 10055 }, { "epoch": 3.4207304111219763, "grad_norm": 6.19306633545983, "learning_rate": 1.1635121740742391e-06, "loss": 0.3604, "step": 10060 }, { "epoch": 3.4224310190893243, "grad_norm": 4.282172329915891, "learning_rate": 1.1612208603575822e-06, "loss": 0.3643, "step": 10065 }, { "epoch": 3.424131627056673, "grad_norm": 4.714097822547551, "learning_rate": 1.1589311223335864e-06, "loss": 0.3367, "step": 10070 }, { "epoch": 3.425832235024021, "grad_norm": 6.438695059124917, "learning_rate": 1.156642962697185e-06, "loss": 0.3488, "step": 10075 }, { "epoch": 3.4275328429913694, "grad_norm": 5.0249657678160125, "learning_rate": 1.1543563841414571e-06, "loss": 0.3586, "step": 10080 }, { "epoch": 3.429233450958718, "grad_norm": 4.565364515731503, "learning_rate": 1.1520713893576199e-06, "loss": 0.3433, "step": 10085 }, { "epoch": 3.430934058926066, "grad_norm": 6.5535294321935655, "learning_rate": 1.1497879810350251e-06, "loss": 0.3728, "step": 10090 }, { "epoch": 3.4326346668934145, "grad_norm": 4.757141860105625, "learning_rate": 1.14750616186116e-06, "loss": 0.3664, "step": 10095 }, { "epoch": 3.4343352748607625, "grad_norm": 3.2389630049290354, "learning_rate": 1.1452259345216377e-06, "loss": 0.3397, "step": 10100 }, { "epoch": 3.436035882828111, "grad_norm": 3.3426138556572704, "learning_rate": 1.1429473017002022e-06, "loss": 0.3531, "step": 10105 }, { "epoch": 3.4377364907954595, "grad_norm": 5.8805448197995815, "learning_rate": 1.1406702660787163e-06, "loss": 0.3621, "step": 10110 }, { "epoch": 3.4394370987628076, "grad_norm": 3.438221317751023, "learning_rate": 1.1383948303371675e-06, "loss": 0.3341, "step": 10115 }, { "epoch": 3.441137706730156, "grad_norm": 4.173544496895103, "learning_rate": 1.136120997153656e-06, "loss": 0.379, "step": 10120 }, { "epoch": 3.4428383146975046, "grad_norm": 15.351690047816835, "learning_rate": 1.1338487692044e-06, "loss": 0.3483, "step": 10125 }, { "epoch": 3.4445389226648526, "grad_norm": 4.903093752091242, "learning_rate": 1.131578149163724e-06, "loss": 0.3541, "step": 10130 }, { "epoch": 3.446239530632201, "grad_norm": 4.0036016634813025, "learning_rate": 1.1293091397040635e-06, "loss": 0.3513, "step": 10135 }, { "epoch": 3.447940138599549, "grad_norm": 3.691397103097567, "learning_rate": 1.127041743495958e-06, "loss": 0.3499, "step": 10140 }, { "epoch": 3.4496407465668977, "grad_norm": 4.672776733697925, "learning_rate": 1.1247759632080456e-06, "loss": 0.3558, "step": 10145 }, { "epoch": 3.451341354534246, "grad_norm": 4.709387888832122, "learning_rate": 1.122511801507067e-06, "loss": 0.3354, "step": 10150 }, { "epoch": 3.4530419625015942, "grad_norm": 4.020983262420783, "learning_rate": 1.120249261057852e-06, "loss": 0.3786, "step": 10155 }, { "epoch": 3.4547425704689427, "grad_norm": 3.9715982397829372, "learning_rate": 1.117988344523329e-06, "loss": 0.3354, "step": 10160 }, { "epoch": 3.456443178436291, "grad_norm": 4.288485704118769, "learning_rate": 1.1157290545645088e-06, "loss": 0.3595, "step": 10165 }, { "epoch": 3.4581437864036393, "grad_norm": 9.768926750109651, "learning_rate": 1.113471393840493e-06, "loss": 0.3412, "step": 10170 }, { "epoch": 3.459844394370988, "grad_norm": 4.138834652364292, "learning_rate": 1.1112153650084608e-06, "loss": 0.3586, "step": 10175 }, { "epoch": 3.461545002338336, "grad_norm": 3.753789179270736, "learning_rate": 1.108960970723676e-06, "loss": 0.3445, "step": 10180 }, { "epoch": 3.4632456103056843, "grad_norm": 6.0473161120725045, "learning_rate": 1.1067082136394732e-06, "loss": 0.3383, "step": 10185 }, { "epoch": 3.464946218273033, "grad_norm": 3.87336866761418, "learning_rate": 1.1044570964072649e-06, "loss": 0.349, "step": 10190 }, { "epoch": 3.466646826240381, "grad_norm": 6.983398625443584, "learning_rate": 1.1022076216765295e-06, "loss": 0.3548, "step": 10195 }, { "epoch": 3.4683474342077294, "grad_norm": 7.512321705522319, "learning_rate": 1.0999597920948149e-06, "loss": 0.3485, "step": 10200 }, { "epoch": 3.4700480421750775, "grad_norm": 6.463333767875482, "learning_rate": 1.097713610307733e-06, "loss": 0.365, "step": 10205 }, { "epoch": 3.471748650142426, "grad_norm": 4.028670236526878, "learning_rate": 1.0954690789589533e-06, "loss": 0.3472, "step": 10210 }, { "epoch": 3.473449258109774, "grad_norm": 5.201598757551068, "learning_rate": 1.0932262006902064e-06, "loss": 0.353, "step": 10215 }, { "epoch": 3.4751498660771225, "grad_norm": 5.152639228499577, "learning_rate": 1.090984978141274e-06, "loss": 0.344, "step": 10220 }, { "epoch": 3.476850474044471, "grad_norm": 5.823956212740816, "learning_rate": 1.0887454139499925e-06, "loss": 0.3519, "step": 10225 }, { "epoch": 3.478551082011819, "grad_norm": 4.225798446296788, "learning_rate": 1.086507510752243e-06, "loss": 0.3634, "step": 10230 }, { "epoch": 3.4802516899791676, "grad_norm": 3.635547643893639, "learning_rate": 1.0842712711819548e-06, "loss": 0.357, "step": 10235 }, { "epoch": 3.481952297946516, "grad_norm": 4.665937181986643, "learning_rate": 1.0820366978710959e-06, "loss": 0.3306, "step": 10240 }, { "epoch": 3.483652905913864, "grad_norm": 4.106378454272665, "learning_rate": 1.079803793449677e-06, "loss": 0.3356, "step": 10245 }, { "epoch": 3.4853535138812126, "grad_norm": 7.357366696829599, "learning_rate": 1.0775725605457404e-06, "loss": 0.3687, "step": 10250 }, { "epoch": 3.4870541218485607, "grad_norm": 4.677520590767949, "learning_rate": 1.0753430017853646e-06, "loss": 0.3455, "step": 10255 }, { "epoch": 3.488754729815909, "grad_norm": 5.516153393659592, "learning_rate": 1.0731151197926573e-06, "loss": 0.3439, "step": 10260 }, { "epoch": 3.4904553377832577, "grad_norm": 5.04066022640554, "learning_rate": 1.0708889171897497e-06, "loss": 0.3464, "step": 10265 }, { "epoch": 3.4921559457506057, "grad_norm": 4.141417524214379, "learning_rate": 1.0686643965968002e-06, "loss": 0.3646, "step": 10270 }, { "epoch": 3.4938565537179542, "grad_norm": 5.619200791285557, "learning_rate": 1.0664415606319843e-06, "loss": 0.3681, "step": 10275 }, { "epoch": 3.4955571616853023, "grad_norm": 5.899705389378892, "learning_rate": 1.0642204119114976e-06, "loss": 0.3381, "step": 10280 }, { "epoch": 3.497257769652651, "grad_norm": 4.039755263654937, "learning_rate": 1.0620009530495473e-06, "loss": 0.3418, "step": 10285 }, { "epoch": 3.4989583776199993, "grad_norm": 4.3477768599212085, "learning_rate": 1.0597831866583547e-06, "loss": 0.3375, "step": 10290 }, { "epoch": 3.5006589855873473, "grad_norm": 6.6985895763333785, "learning_rate": 1.057567115348145e-06, "loss": 0.3603, "step": 10295 }, { "epoch": 3.502359593554696, "grad_norm": 19.698218539245715, "learning_rate": 1.055352741727153e-06, "loss": 0.3559, "step": 10300 }, { "epoch": 3.5040602015220443, "grad_norm": 5.609784918032436, "learning_rate": 1.0531400684016114e-06, "loss": 0.3465, "step": 10305 }, { "epoch": 3.5057608094893924, "grad_norm": 3.851627555281216, "learning_rate": 1.0509290979757548e-06, "loss": 0.3581, "step": 10310 }, { "epoch": 3.507461417456741, "grad_norm": 9.753059698528306, "learning_rate": 1.0487198330518105e-06, "loss": 0.3395, "step": 10315 }, { "epoch": 3.509162025424089, "grad_norm": 19.66444586754753, "learning_rate": 1.0465122762300015e-06, "loss": 0.3558, "step": 10320 }, { "epoch": 3.5108626333914374, "grad_norm": 4.973830782952403, "learning_rate": 1.0443064301085394e-06, "loss": 0.3457, "step": 10325 }, { "epoch": 3.5125632413587855, "grad_norm": 4.6413672186230635, "learning_rate": 1.0421022972836206e-06, "loss": 0.3549, "step": 10330 }, { "epoch": 3.514263849326134, "grad_norm": 47.803694846795985, "learning_rate": 1.0398998803494282e-06, "loss": 0.3592, "step": 10335 }, { "epoch": 3.5159644572934825, "grad_norm": 36.10983220537593, "learning_rate": 1.0376991818981225e-06, "loss": 0.3571, "step": 10340 }, { "epoch": 3.5176650652608306, "grad_norm": 7.619601200166429, "learning_rate": 1.035500204519844e-06, "loss": 0.3404, "step": 10345 }, { "epoch": 3.519365673228179, "grad_norm": 20.895655303154214, "learning_rate": 1.0333029508027048e-06, "loss": 0.3671, "step": 10350 }, { "epoch": 3.5210662811955276, "grad_norm": 6.555757587556768, "learning_rate": 1.031107423332792e-06, "loss": 0.3567, "step": 10355 }, { "epoch": 3.5227668891628756, "grad_norm": 6.097681358369745, "learning_rate": 1.028913624694156e-06, "loss": 0.34, "step": 10360 }, { "epoch": 3.524467497130224, "grad_norm": 5.384410281123741, "learning_rate": 1.0267215574688183e-06, "loss": 0.339, "step": 10365 }, { "epoch": 3.5261681050975726, "grad_norm": 3.609206753858732, "learning_rate": 1.024531224236757e-06, "loss": 0.3541, "step": 10370 }, { "epoch": 3.5278687130649207, "grad_norm": 4.68829761586958, "learning_rate": 1.0223426275759126e-06, "loss": 0.3696, "step": 10375 }, { "epoch": 3.529569321032269, "grad_norm": 3.441930989952324, "learning_rate": 1.0201557700621822e-06, "loss": 0.355, "step": 10380 }, { "epoch": 3.5312699289996172, "grad_norm": 4.185920112694374, "learning_rate": 1.0179706542694131e-06, "loss": 0.3331, "step": 10385 }, { "epoch": 3.5329705369669657, "grad_norm": 5.302726584312579, "learning_rate": 1.0157872827694059e-06, "loss": 0.3426, "step": 10390 }, { "epoch": 3.534671144934314, "grad_norm": 4.0486171277544445, "learning_rate": 1.0136056581319054e-06, "loss": 0.341, "step": 10395 }, { "epoch": 3.5363717529016623, "grad_norm": 7.751605073146907, "learning_rate": 1.0114257829246026e-06, "loss": 0.3352, "step": 10400 }, { "epoch": 3.538072360869011, "grad_norm": 3.840718121416357, "learning_rate": 1.0092476597131274e-06, "loss": 0.3685, "step": 10405 }, { "epoch": 3.539772968836359, "grad_norm": 34.9378034765729, "learning_rate": 1.0070712910610495e-06, "loss": 0.3263, "step": 10410 }, { "epoch": 3.5414735768037073, "grad_norm": 5.3549959797705275, "learning_rate": 1.0048966795298731e-06, "loss": 0.36, "step": 10415 }, { "epoch": 3.543174184771056, "grad_norm": 4.467630253567017, "learning_rate": 1.0027238276790348e-06, "loss": 0.3355, "step": 10420 }, { "epoch": 3.544874792738404, "grad_norm": 4.163273802824049, "learning_rate": 1.0005527380658978e-06, "loss": 0.3453, "step": 10425 }, { "epoch": 3.5465754007057524, "grad_norm": 7.726891359050451, "learning_rate": 9.983834132457549e-07, "loss": 0.3289, "step": 10430 }, { "epoch": 3.548276008673101, "grad_norm": 3.029339493191297, "learning_rate": 9.962158557718172e-07, "loss": 0.3581, "step": 10435 }, { "epoch": 3.549976616640449, "grad_norm": 4.362906572985257, "learning_rate": 9.940500681952208e-07, "loss": 0.3826, "step": 10440 }, { "epoch": 3.5516772246077974, "grad_norm": 4.018003641401903, "learning_rate": 9.91886053065014e-07, "loss": 0.3375, "step": 10445 }, { "epoch": 3.5533778325751455, "grad_norm": 4.878837546624064, "learning_rate": 9.897238129281633e-07, "loss": 0.3217, "step": 10450 }, { "epoch": 3.555078440542494, "grad_norm": 14.33326882112203, "learning_rate": 9.875633503295417e-07, "loss": 0.3542, "step": 10455 }, { "epoch": 3.556779048509842, "grad_norm": 10.72392420053456, "learning_rate": 9.854046678119347e-07, "loss": 0.3526, "step": 10460 }, { "epoch": 3.5584796564771906, "grad_norm": 3.5231895845560977, "learning_rate": 9.83247767916028e-07, "loss": 0.3417, "step": 10465 }, { "epoch": 3.560180264444539, "grad_norm": 3.614269750006329, "learning_rate": 9.810926531804129e-07, "loss": 0.3525, "step": 10470 }, { "epoch": 3.561880872411887, "grad_norm": 20.10042035284421, "learning_rate": 9.789393261415792e-07, "loss": 0.3569, "step": 10475 }, { "epoch": 3.5635814803792356, "grad_norm": 3.64653363386056, "learning_rate": 9.767877893339097e-07, "loss": 0.3637, "step": 10480 }, { "epoch": 3.565282088346584, "grad_norm": 5.791402748117439, "learning_rate": 9.746380452896844e-07, "loss": 0.3673, "step": 10485 }, { "epoch": 3.566982696313932, "grad_norm": 3.9820409485030317, "learning_rate": 9.72490096539069e-07, "loss": 0.3451, "step": 10490 }, { "epoch": 3.5686833042812807, "grad_norm": 5.764970808084148, "learning_rate": 9.703439456101205e-07, "loss": 0.3671, "step": 10495 }, { "epoch": 3.5703839122486287, "grad_norm": 10.613146957278644, "learning_rate": 9.681995950287756e-07, "loss": 0.3462, "step": 10500 }, { "epoch": 3.572084520215977, "grad_norm": 4.444097200241279, "learning_rate": 9.660570473188565e-07, "loss": 0.3362, "step": 10505 }, { "epoch": 3.5737851281833253, "grad_norm": 4.151581287292085, "learning_rate": 9.639163050020589e-07, "loss": 0.3446, "step": 10510 }, { "epoch": 3.5754857361506738, "grad_norm": 3.4422524347308134, "learning_rate": 9.61777370597958e-07, "loss": 0.331, "step": 10515 }, { "epoch": 3.5771863441180223, "grad_norm": 6.291695215625101, "learning_rate": 9.596402466239973e-07, "loss": 0.3373, "step": 10520 }, { "epoch": 3.5788869520853703, "grad_norm": 8.458224481826973, "learning_rate": 9.57504935595492e-07, "loss": 0.3491, "step": 10525 }, { "epoch": 3.580587560052719, "grad_norm": 7.847005455939928, "learning_rate": 9.55371440025624e-07, "loss": 0.3554, "step": 10530 }, { "epoch": 3.5822881680200673, "grad_norm": 4.161632621025964, "learning_rate": 9.532397624254353e-07, "loss": 0.3358, "step": 10535 }, { "epoch": 3.5839887759874154, "grad_norm": 5.333937407524186, "learning_rate": 9.511099053038319e-07, "loss": 0.3403, "step": 10540 }, { "epoch": 3.585689383954764, "grad_norm": 47.99447831985307, "learning_rate": 9.489818711675742e-07, "loss": 0.3723, "step": 10545 }, { "epoch": 3.5873899919221124, "grad_norm": 4.823602301167504, "learning_rate": 9.468556625212791e-07, "loss": 0.3429, "step": 10550 }, { "epoch": 3.5890905998894604, "grad_norm": 5.119510687553085, "learning_rate": 9.447312818674134e-07, "loss": 0.3423, "step": 10555 }, { "epoch": 3.590791207856809, "grad_norm": 5.457065843868638, "learning_rate": 9.426087317062943e-07, "loss": 0.3309, "step": 10560 }, { "epoch": 3.592491815824157, "grad_norm": 3.4550953372137454, "learning_rate": 9.40488014536082e-07, "loss": 0.365, "step": 10565 }, { "epoch": 3.5941924237915055, "grad_norm": 5.41452648877141, "learning_rate": 9.383691328527824e-07, "loss": 0.3508, "step": 10570 }, { "epoch": 3.5958930317588536, "grad_norm": 5.534474014926724, "learning_rate": 9.362520891502377e-07, "loss": 0.371, "step": 10575 }, { "epoch": 3.597593639726202, "grad_norm": 4.940924121708319, "learning_rate": 9.341368859201308e-07, "loss": 0.3467, "step": 10580 }, { "epoch": 3.5992942476935506, "grad_norm": 17.61714640188424, "learning_rate": 9.320235256519741e-07, "loss": 0.3715, "step": 10585 }, { "epoch": 3.6009948556608986, "grad_norm": 5.751312456441521, "learning_rate": 9.299120108331142e-07, "loss": 0.3339, "step": 10590 }, { "epoch": 3.602695463628247, "grad_norm": 5.166376894454055, "learning_rate": 9.278023439487252e-07, "loss": 0.363, "step": 10595 }, { "epoch": 3.6043960715955956, "grad_norm": 3.863575377984854, "learning_rate": 9.256945274818038e-07, "loss": 0.3496, "step": 10600 }, { "epoch": 3.6060966795629437, "grad_norm": 5.513614259293148, "learning_rate": 9.23588563913173e-07, "loss": 0.3335, "step": 10605 }, { "epoch": 3.607797287530292, "grad_norm": 4.952598970243949, "learning_rate": 9.214844557214705e-07, "loss": 0.3779, "step": 10610 }, { "epoch": 3.6094978954976407, "grad_norm": 9.341478820658665, "learning_rate": 9.193822053831542e-07, "loss": 0.3565, "step": 10615 }, { "epoch": 3.6111985034649887, "grad_norm": 4.086573306032363, "learning_rate": 9.172818153724919e-07, "loss": 0.3567, "step": 10620 }, { "epoch": 3.612899111432337, "grad_norm": 6.5010472580613925, "learning_rate": 9.151832881615652e-07, "loss": 0.3656, "step": 10625 }, { "epoch": 3.6145997193996853, "grad_norm": 3.354870904762796, "learning_rate": 9.130866262202603e-07, "loss": 0.3638, "step": 10630 }, { "epoch": 3.6163003273670338, "grad_norm": 4.1115236060147575, "learning_rate": 9.109918320162708e-07, "loss": 0.3469, "step": 10635 }, { "epoch": 3.618000935334382, "grad_norm": 4.607088368228926, "learning_rate": 9.08898908015089e-07, "loss": 0.3366, "step": 10640 }, { "epoch": 3.6197015433017303, "grad_norm": 18.186133697983188, "learning_rate": 9.068078566800084e-07, "loss": 0.3574, "step": 10645 }, { "epoch": 3.621402151269079, "grad_norm": 3.9147702267000333, "learning_rate": 9.047186804721189e-07, "loss": 0.3317, "step": 10650 }, { "epoch": 3.623102759236427, "grad_norm": 7.557513551869926, "learning_rate": 9.026313818503002e-07, "loss": 0.3489, "step": 10655 }, { "epoch": 3.6248033672037754, "grad_norm": 5.3993015624251095, "learning_rate": 9.005459632712263e-07, "loss": 0.372, "step": 10660 }, { "epoch": 3.626503975171124, "grad_norm": 5.053809013474026, "learning_rate": 8.984624271893544e-07, "loss": 0.3557, "step": 10665 }, { "epoch": 3.628204583138472, "grad_norm": 6.460565333738014, "learning_rate": 8.963807760569296e-07, "loss": 0.347, "step": 10670 }, { "epoch": 3.6299051911058204, "grad_norm": 4.628311970760183, "learning_rate": 8.943010123239756e-07, "loss": 0.3636, "step": 10675 }, { "epoch": 3.6316057990731685, "grad_norm": 5.234295870033782, "learning_rate": 8.922231384382976e-07, "loss": 0.3487, "step": 10680 }, { "epoch": 3.633306407040517, "grad_norm": 10.655149420447447, "learning_rate": 8.901471568454734e-07, "loss": 0.3623, "step": 10685 }, { "epoch": 3.635007015007865, "grad_norm": 9.126701672294454, "learning_rate": 8.880730699888565e-07, "loss": 0.341, "step": 10690 }, { "epoch": 3.6367076229752135, "grad_norm": 4.087730378521851, "learning_rate": 8.86000880309568e-07, "loss": 0.366, "step": 10695 }, { "epoch": 3.638408230942562, "grad_norm": 3.4169310173879475, "learning_rate": 8.839305902464982e-07, "loss": 0.3535, "step": 10700 }, { "epoch": 3.64010883890991, "grad_norm": 5.473157698564127, "learning_rate": 8.818622022362991e-07, "loss": 0.3298, "step": 10705 }, { "epoch": 3.6418094468772586, "grad_norm": 3.986001428965386, "learning_rate": 8.797957187133866e-07, "loss": 0.3345, "step": 10710 }, { "epoch": 3.643510054844607, "grad_norm": 5.36691398359891, "learning_rate": 8.777311421099347e-07, "loss": 0.3472, "step": 10715 }, { "epoch": 3.645210662811955, "grad_norm": 8.850301792304556, "learning_rate": 8.756684748558708e-07, "loss": 0.3324, "step": 10720 }, { "epoch": 3.6469112707793037, "grad_norm": 4.686613735711385, "learning_rate": 8.736077193788781e-07, "loss": 0.3572, "step": 10725 }, { "epoch": 3.648611878746652, "grad_norm": 5.313370553096401, "learning_rate": 8.715488781043869e-07, "loss": 0.3466, "step": 10730 }, { "epoch": 3.650312486714, "grad_norm": 4.201836558647826, "learning_rate": 8.694919534555771e-07, "loss": 0.3613, "step": 10735 }, { "epoch": 3.6520130946813487, "grad_norm": 4.391708804040991, "learning_rate": 8.674369478533701e-07, "loss": 0.3609, "step": 10740 }, { "epoch": 3.6537137026486968, "grad_norm": 5.3463161467714855, "learning_rate": 8.653838637164321e-07, "loss": 0.3543, "step": 10745 }, { "epoch": 3.6554143106160453, "grad_norm": 3.983674499953354, "learning_rate": 8.633327034611638e-07, "loss": 0.3384, "step": 10750 }, { "epoch": 3.6571149185833933, "grad_norm": 28.895736753259612, "learning_rate": 8.612834695017055e-07, "loss": 0.3391, "step": 10755 }, { "epoch": 3.658815526550742, "grad_norm": 4.895856073408972, "learning_rate": 8.59236164249927e-07, "loss": 0.3486, "step": 10760 }, { "epoch": 3.6605161345180903, "grad_norm": 7.232101311424993, "learning_rate": 8.571907901154297e-07, "loss": 0.3631, "step": 10765 }, { "epoch": 3.6622167424854384, "grad_norm": 5.333254007085302, "learning_rate": 8.551473495055435e-07, "loss": 0.3474, "step": 10770 }, { "epoch": 3.663917350452787, "grad_norm": 4.052411850251206, "learning_rate": 8.53105844825319e-07, "loss": 0.3477, "step": 10775 }, { "epoch": 3.6656179584201354, "grad_norm": 6.471828711368031, "learning_rate": 8.510662784775322e-07, "loss": 0.3633, "step": 10780 }, { "epoch": 3.6673185663874834, "grad_norm": 4.578665714371467, "learning_rate": 8.490286528626743e-07, "loss": 0.3554, "step": 10785 }, { "epoch": 3.669019174354832, "grad_norm": 4.960024978436364, "learning_rate": 8.469929703789554e-07, "loss": 0.3398, "step": 10790 }, { "epoch": 3.6707197823221804, "grad_norm": 7.649576478459408, "learning_rate": 8.449592334222956e-07, "loss": 0.3676, "step": 10795 }, { "epoch": 3.6724203902895285, "grad_norm": 3.27366413812755, "learning_rate": 8.429274443863286e-07, "loss": 0.3652, "step": 10800 }, { "epoch": 3.674120998256877, "grad_norm": 5.600994248558118, "learning_rate": 8.408976056623919e-07, "loss": 0.3524, "step": 10805 }, { "epoch": 3.675821606224225, "grad_norm": 5.647153791168265, "learning_rate": 8.388697196395309e-07, "loss": 0.3387, "step": 10810 }, { "epoch": 3.6775222141915735, "grad_norm": 4.115828451734707, "learning_rate": 8.368437887044895e-07, "loss": 0.3659, "step": 10815 }, { "epoch": 3.6792228221589216, "grad_norm": 3.1285968581963863, "learning_rate": 8.348198152417136e-07, "loss": 0.3435, "step": 10820 }, { "epoch": 3.68092343012627, "grad_norm": 3.332093467858973, "learning_rate": 8.327978016333424e-07, "loss": 0.3332, "step": 10825 }, { "epoch": 3.6826240380936186, "grad_norm": 5.434188342715908, "learning_rate": 8.307777502592104e-07, "loss": 0.3415, "step": 10830 }, { "epoch": 3.6843246460609667, "grad_norm": 5.252703651996574, "learning_rate": 8.287596634968431e-07, "loss": 0.3264, "step": 10835 }, { "epoch": 3.686025254028315, "grad_norm": 5.025378368137891, "learning_rate": 8.26743543721451e-07, "loss": 0.3666, "step": 10840 }, { "epoch": 3.6877258619956637, "grad_norm": 6.449464171627322, "learning_rate": 8.247293933059328e-07, "loss": 0.3587, "step": 10845 }, { "epoch": 3.6894264699630117, "grad_norm": 27.15790696281429, "learning_rate": 8.227172146208659e-07, "loss": 0.3541, "step": 10850 }, { "epoch": 3.69112707793036, "grad_norm": 3.6536930635245595, "learning_rate": 8.207070100345107e-07, "loss": 0.3496, "step": 10855 }, { "epoch": 3.6928276858977083, "grad_norm": 6.4663982405136835, "learning_rate": 8.186987819128008e-07, "loss": 0.3581, "step": 10860 }, { "epoch": 3.6945282938650568, "grad_norm": 3.8326194223116095, "learning_rate": 8.166925326193464e-07, "loss": 0.3538, "step": 10865 }, { "epoch": 3.696228901832405, "grad_norm": 5.176863609255767, "learning_rate": 8.146882645154263e-07, "loss": 0.3684, "step": 10870 }, { "epoch": 3.6979295097997533, "grad_norm": 4.983850690407328, "learning_rate": 8.126859799599898e-07, "loss": 0.3334, "step": 10875 }, { "epoch": 3.699630117767102, "grad_norm": 14.71099045181069, "learning_rate": 8.106856813096492e-07, "loss": 0.3413, "step": 10880 }, { "epoch": 3.70133072573445, "grad_norm": 13.00188359280808, "learning_rate": 8.08687370918681e-07, "loss": 0.3545, "step": 10885 }, { "epoch": 3.7030313337017984, "grad_norm": 3.8544979857678414, "learning_rate": 8.066910511390228e-07, "loss": 0.3567, "step": 10890 }, { "epoch": 3.704731941669147, "grad_norm": 5.130480817192012, "learning_rate": 8.046967243202656e-07, "loss": 0.3415, "step": 10895 }, { "epoch": 3.706432549636495, "grad_norm": 7.468282471790433, "learning_rate": 8.02704392809659e-07, "loss": 0.3237, "step": 10900 }, { "epoch": 3.7081331576038434, "grad_norm": 9.985122000430085, "learning_rate": 8.007140589521006e-07, "loss": 0.3304, "step": 10905 }, { "epoch": 3.709833765571192, "grad_norm": 6.9889392828863475, "learning_rate": 7.987257250901398e-07, "loss": 0.3331, "step": 10910 }, { "epoch": 3.71153437353854, "grad_norm": 11.080424473644747, "learning_rate": 7.967393935639695e-07, "loss": 0.346, "step": 10915 }, { "epoch": 3.7132349815058885, "grad_norm": 4.061944846859679, "learning_rate": 7.947550667114284e-07, "loss": 0.3547, "step": 10920 }, { "epoch": 3.7149355894732365, "grad_norm": 4.106047339463541, "learning_rate": 7.92772746867993e-07, "loss": 0.3422, "step": 10925 }, { "epoch": 3.716636197440585, "grad_norm": 3.838078261798806, "learning_rate": 7.907924363667807e-07, "loss": 0.3369, "step": 10930 }, { "epoch": 3.718336805407933, "grad_norm": 6.69051307658985, "learning_rate": 7.888141375385411e-07, "loss": 0.3575, "step": 10935 }, { "epoch": 3.7200374133752816, "grad_norm": 3.667423958159118, "learning_rate": 7.868378527116588e-07, "loss": 0.3403, "step": 10940 }, { "epoch": 3.72173802134263, "grad_norm": 4.360620344296931, "learning_rate": 7.848635842121452e-07, "loss": 0.3491, "step": 10945 }, { "epoch": 3.723438629309978, "grad_norm": 2.873390222661289, "learning_rate": 7.828913343636407e-07, "loss": 0.3198, "step": 10950 }, { "epoch": 3.7251392372773267, "grad_norm": 2.610382891982884, "learning_rate": 7.809211054874102e-07, "loss": 0.3493, "step": 10955 }, { "epoch": 3.726839845244675, "grad_norm": 5.011617850920955, "learning_rate": 7.789528999023369e-07, "loss": 0.3472, "step": 10960 }, { "epoch": 3.728540453212023, "grad_norm": 4.893587266486058, "learning_rate": 7.769867199249264e-07, "loss": 0.3603, "step": 10965 }, { "epoch": 3.7302410611793717, "grad_norm": 4.605818910102079, "learning_rate": 7.750225678692974e-07, "loss": 0.3347, "step": 10970 }, { "epoch": 3.73194166914672, "grad_norm": 3.330070000188197, "learning_rate": 7.730604460471841e-07, "loss": 0.3323, "step": 10975 }, { "epoch": 3.7336422771140683, "grad_norm": 5.063438012995593, "learning_rate": 7.711003567679285e-07, "loss": 0.3396, "step": 10980 }, { "epoch": 3.7353428850814168, "grad_norm": 5.161235017584308, "learning_rate": 7.691423023384837e-07, "loss": 0.3579, "step": 10985 }, { "epoch": 3.737043493048765, "grad_norm": 5.530851535182855, "learning_rate": 7.671862850634041e-07, "loss": 0.3605, "step": 10990 }, { "epoch": 3.7387441010161133, "grad_norm": 6.390053527359117, "learning_rate": 7.652323072448503e-07, "loss": 0.3411, "step": 10995 }, { "epoch": 3.7404447089834614, "grad_norm": 7.104792676720178, "learning_rate": 7.63280371182579e-07, "loss": 0.3454, "step": 11000 }, { "epoch": 3.74214531695081, "grad_norm": 2.8862857830970543, "learning_rate": 7.613304791739465e-07, "loss": 0.3636, "step": 11005 }, { "epoch": 3.7438459249181584, "grad_norm": 5.550515786579572, "learning_rate": 7.593826335139026e-07, "loss": 0.3541, "step": 11010 }, { "epoch": 3.7455465328855064, "grad_norm": 4.427229901279991, "learning_rate": 7.574368364949872e-07, "loss": 0.3323, "step": 11015 }, { "epoch": 3.747247140852855, "grad_norm": 4.178971671598606, "learning_rate": 7.554930904073313e-07, "loss": 0.3335, "step": 11020 }, { "epoch": 3.7489477488202034, "grad_norm": 3.8652351408257153, "learning_rate": 7.535513975386496e-07, "loss": 0.3588, "step": 11025 }, { "epoch": 3.7506483567875515, "grad_norm": 5.292558860177557, "learning_rate": 7.516117601742434e-07, "loss": 0.34, "step": 11030 }, { "epoch": 3.7523489647549, "grad_norm": 3.826163337894789, "learning_rate": 7.496741805969907e-07, "loss": 0.3537, "step": 11035 }, { "epoch": 3.754049572722248, "grad_norm": 6.067442756842923, "learning_rate": 7.477386610873516e-07, "loss": 0.3455, "step": 11040 }, { "epoch": 3.7557501806895965, "grad_norm": 2.9473887584803076, "learning_rate": 7.458052039233582e-07, "loss": 0.3453, "step": 11045 }, { "epoch": 3.7574507886569446, "grad_norm": 4.690369858336077, "learning_rate": 7.438738113806184e-07, "loss": 0.3557, "step": 11050 }, { "epoch": 3.759151396624293, "grad_norm": 4.491174167199348, "learning_rate": 7.419444857323068e-07, "loss": 0.3472, "step": 11055 }, { "epoch": 3.7608520045916416, "grad_norm": 3.799149701193832, "learning_rate": 7.400172292491686e-07, "loss": 0.3458, "step": 11060 }, { "epoch": 3.7625526125589897, "grad_norm": 4.673187961284107, "learning_rate": 7.38092044199511e-07, "loss": 0.3556, "step": 11065 }, { "epoch": 3.764253220526338, "grad_norm": 4.23084028829816, "learning_rate": 7.361689328492044e-07, "loss": 0.3521, "step": 11070 }, { "epoch": 3.7659538284936867, "grad_norm": 3.152798840011891, "learning_rate": 7.342478974616799e-07, "loss": 0.3581, "step": 11075 }, { "epoch": 3.7676544364610347, "grad_norm": 23.4908790433505, "learning_rate": 7.323289402979223e-07, "loss": 0.3593, "step": 11080 }, { "epoch": 3.769355044428383, "grad_norm": 16.479284492601987, "learning_rate": 7.30412063616473e-07, "loss": 0.3544, "step": 11085 }, { "epoch": 3.7710556523957317, "grad_norm": 4.3508974680303, "learning_rate": 7.284972696734225e-07, "loss": 0.3333, "step": 11090 }, { "epoch": 3.7727562603630798, "grad_norm": 3.952590145244473, "learning_rate": 7.265845607224125e-07, "loss": 0.3402, "step": 11095 }, { "epoch": 3.7744568683304283, "grad_norm": 3.421443781155386, "learning_rate": 7.246739390146285e-07, "loss": 0.3609, "step": 11100 }, { "epoch": 3.7761574762977763, "grad_norm": 8.340835601491221, "learning_rate": 7.227654067988013e-07, "loss": 0.3547, "step": 11105 }, { "epoch": 3.777858084265125, "grad_norm": 3.411412544859863, "learning_rate": 7.208589663212002e-07, "loss": 0.3497, "step": 11110 }, { "epoch": 3.779558692232473, "grad_norm": 3.567161121256871, "learning_rate": 7.18954619825635e-07, "loss": 0.37, "step": 11115 }, { "epoch": 3.7812593001998214, "grad_norm": 3.615767506793355, "learning_rate": 7.17052369553449e-07, "loss": 0.3336, "step": 11120 }, { "epoch": 3.78295990816717, "grad_norm": 4.134690328545315, "learning_rate": 7.151522177435196e-07, "loss": 0.3551, "step": 11125 }, { "epoch": 3.784660516134518, "grad_norm": 4.0514203148253385, "learning_rate": 7.132541666322548e-07, "loss": 0.3326, "step": 11130 }, { "epoch": 3.7863611241018664, "grad_norm": 4.172031178322271, "learning_rate": 7.113582184535874e-07, "loss": 0.3582, "step": 11135 }, { "epoch": 3.788061732069215, "grad_norm": 2.7164967983329187, "learning_rate": 7.094643754389794e-07, "loss": 0.3478, "step": 11140 }, { "epoch": 3.789762340036563, "grad_norm": 3.3892192131767804, "learning_rate": 7.075726398174104e-07, "loss": 0.3636, "step": 11145 }, { "epoch": 3.7914629480039115, "grad_norm": 2.963601178791977, "learning_rate": 7.056830138153842e-07, "loss": 0.361, "step": 11150 }, { "epoch": 3.79316355597126, "grad_norm": 12.594805720891362, "learning_rate": 7.037954996569174e-07, "loss": 0.3305, "step": 11155 }, { "epoch": 3.794864163938608, "grad_norm": 3.162660613013296, "learning_rate": 7.01910099563545e-07, "loss": 0.3404, "step": 11160 }, { "epoch": 3.7965647719059565, "grad_norm": 3.8671981846704053, "learning_rate": 7.000268157543102e-07, "loss": 0.352, "step": 11165 }, { "epoch": 3.7982653798733046, "grad_norm": 3.331325951800168, "learning_rate": 6.981456504457687e-07, "loss": 0.3444, "step": 11170 }, { "epoch": 3.799965987840653, "grad_norm": 4.034136357496109, "learning_rate": 6.962666058519801e-07, "loss": 0.329, "step": 11175 }, { "epoch": 3.801666595808001, "grad_norm": 5.7048239174405735, "learning_rate": 6.943896841845105e-07, "loss": 0.3606, "step": 11180 }, { "epoch": 3.8033672037753496, "grad_norm": 3.9899757130108324, "learning_rate": 6.925148876524243e-07, "loss": 0.3587, "step": 11185 }, { "epoch": 3.805067811742698, "grad_norm": 8.230030083565003, "learning_rate": 6.906422184622874e-07, "loss": 0.3427, "step": 11190 }, { "epoch": 3.806768419710046, "grad_norm": 73.47520067805937, "learning_rate": 6.88771678818162e-07, "loss": 0.3474, "step": 11195 }, { "epoch": 3.8084690276773947, "grad_norm": 3.933562785537101, "learning_rate": 6.869032709216009e-07, "loss": 0.3227, "step": 11200 }, { "epoch": 3.810169635644743, "grad_norm": 4.750777773820389, "learning_rate": 6.850369969716514e-07, "loss": 0.3598, "step": 11205 }, { "epoch": 3.8118702436120913, "grad_norm": 4.465645703924592, "learning_rate": 6.831728591648465e-07, "loss": 0.3295, "step": 11210 }, { "epoch": 3.8135708515794398, "grad_norm": 5.031035772172988, "learning_rate": 6.813108596952075e-07, "loss": 0.3438, "step": 11215 }, { "epoch": 3.815271459546788, "grad_norm": 3.7204766759148225, "learning_rate": 6.794510007542363e-07, "loss": 0.3468, "step": 11220 }, { "epoch": 3.8169720675141363, "grad_norm": 31.049482893342063, "learning_rate": 6.775932845309183e-07, "loss": 0.3222, "step": 11225 }, { "epoch": 3.8186726754814844, "grad_norm": 3.281552307266753, "learning_rate": 6.757377132117144e-07, "loss": 0.3525, "step": 11230 }, { "epoch": 3.820373283448833, "grad_norm": 6.6996368343978645, "learning_rate": 6.73884288980563e-07, "loss": 0.3483, "step": 11235 }, { "epoch": 3.8220738914161814, "grad_norm": 8.196919341429526, "learning_rate": 6.720330140188738e-07, "loss": 0.3287, "step": 11240 }, { "epoch": 3.8237744993835294, "grad_norm": 6.205721311864081, "learning_rate": 6.701838905055283e-07, "loss": 0.3471, "step": 11245 }, { "epoch": 3.825475107350878, "grad_norm": 3.5349480188081577, "learning_rate": 6.68336920616876e-07, "loss": 0.354, "step": 11250 }, { "epoch": 3.8271757153182264, "grad_norm": 7.080195985500879, "learning_rate": 6.664921065267294e-07, "loss": 0.346, "step": 11255 }, { "epoch": 3.8288763232855745, "grad_norm": 3.8942073965633903, "learning_rate": 6.646494504063669e-07, "loss": 0.3302, "step": 11260 }, { "epoch": 3.830576931252923, "grad_norm": 4.2304240711639896, "learning_rate": 6.628089544245239e-07, "loss": 0.3488, "step": 11265 }, { "epoch": 3.8322775392202715, "grad_norm": 6.856566626034545, "learning_rate": 6.609706207473962e-07, "loss": 0.3347, "step": 11270 }, { "epoch": 3.8339781471876195, "grad_norm": 4.627238887062445, "learning_rate": 6.591344515386322e-07, "loss": 0.3409, "step": 11275 }, { "epoch": 3.835678755154968, "grad_norm": 3.9126156850905693, "learning_rate": 6.573004489593352e-07, "loss": 0.3273, "step": 11280 }, { "epoch": 3.837379363122316, "grad_norm": 9.229389838058808, "learning_rate": 6.554686151680553e-07, "loss": 0.3605, "step": 11285 }, { "epoch": 3.8390799710896646, "grad_norm": 4.1718374787018835, "learning_rate": 6.536389523207942e-07, "loss": 0.3593, "step": 11290 }, { "epoch": 3.8407805790570126, "grad_norm": 4.279657448841572, "learning_rate": 6.518114625709946e-07, "loss": 0.3401, "step": 11295 }, { "epoch": 3.842481187024361, "grad_norm": 3.7404141093085412, "learning_rate": 6.499861480695441e-07, "loss": 0.3624, "step": 11300 }, { "epoch": 3.8441817949917096, "grad_norm": 4.092752721420215, "learning_rate": 6.48163010964768e-07, "loss": 0.3372, "step": 11305 }, { "epoch": 3.8458824029590577, "grad_norm": 3.9814207225263187, "learning_rate": 6.463420534024309e-07, "loss": 0.3357, "step": 11310 }, { "epoch": 3.847583010926406, "grad_norm": 4.154660851481634, "learning_rate": 6.445232775257318e-07, "loss": 0.331, "step": 11315 }, { "epoch": 3.8492836188937547, "grad_norm": 4.795935575498587, "learning_rate": 6.427066854753003e-07, "loss": 0.3449, "step": 11320 }, { "epoch": 3.8509842268611028, "grad_norm": 5.521989090755959, "learning_rate": 6.408922793891981e-07, "loss": 0.3345, "step": 11325 }, { "epoch": 3.8526848348284513, "grad_norm": 3.6124264673601765, "learning_rate": 6.390800614029116e-07, "loss": 0.3611, "step": 11330 }, { "epoch": 3.8543854427957998, "grad_norm": 3.958122727061788, "learning_rate": 6.372700336493546e-07, "loss": 0.3374, "step": 11335 }, { "epoch": 3.856086050763148, "grad_norm": 9.65760106782031, "learning_rate": 6.354621982588596e-07, "loss": 0.3428, "step": 11340 }, { "epoch": 3.8577866587304963, "grad_norm": 4.3528128756831475, "learning_rate": 6.336565573591833e-07, "loss": 0.3384, "step": 11345 }, { "epoch": 3.8594872666978444, "grad_norm": 4.133636095887551, "learning_rate": 6.318531130754949e-07, "loss": 0.3488, "step": 11350 }, { "epoch": 3.861187874665193, "grad_norm": 4.171733396761034, "learning_rate": 6.300518675303821e-07, "loss": 0.3519, "step": 11355 }, { "epoch": 3.862888482632541, "grad_norm": 14.418268068439486, "learning_rate": 6.282528228438417e-07, "loss": 0.3318, "step": 11360 }, { "epoch": 3.8645890905998894, "grad_norm": 14.77951062584472, "learning_rate": 6.264559811332829e-07, "loss": 0.3408, "step": 11365 }, { "epoch": 3.866289698567238, "grad_norm": 3.52859372811698, "learning_rate": 6.246613445135194e-07, "loss": 0.3484, "step": 11370 }, { "epoch": 3.867990306534586, "grad_norm": 5.050039245168925, "learning_rate": 6.228689150967718e-07, "loss": 0.3387, "step": 11375 }, { "epoch": 3.8696909145019345, "grad_norm": 3.082733389247098, "learning_rate": 6.210786949926626e-07, "loss": 0.3356, "step": 11380 }, { "epoch": 3.871391522469283, "grad_norm": 3.621152839290671, "learning_rate": 6.192906863082121e-07, "loss": 0.3294, "step": 11385 }, { "epoch": 3.873092130436631, "grad_norm": 21.51319810625766, "learning_rate": 6.175048911478407e-07, "loss": 0.3412, "step": 11390 }, { "epoch": 3.8747927384039795, "grad_norm": 11.121889177588187, "learning_rate": 6.157213116133604e-07, "loss": 0.3311, "step": 11395 }, { "epoch": 3.8764933463713276, "grad_norm": 23.979128719969413, "learning_rate": 6.139399498039791e-07, "loss": 0.3547, "step": 11400 }, { "epoch": 3.878193954338676, "grad_norm": 5.665138082890954, "learning_rate": 6.12160807816291e-07, "loss": 0.3434, "step": 11405 }, { "epoch": 3.879894562306024, "grad_norm": 6.6538289108242745, "learning_rate": 6.103838877442806e-07, "loss": 0.3622, "step": 11410 }, { "epoch": 3.8815951702733726, "grad_norm": 5.380750179055174, "learning_rate": 6.086091916793144e-07, "loss": 0.3529, "step": 11415 }, { "epoch": 3.883295778240721, "grad_norm": 3.6063176955128182, "learning_rate": 6.068367217101446e-07, "loss": 0.3644, "step": 11420 }, { "epoch": 3.884996386208069, "grad_norm": 7.8881610245357345, "learning_rate": 6.050664799228998e-07, "loss": 0.3597, "step": 11425 }, { "epoch": 3.8866969941754177, "grad_norm": 4.508278572980578, "learning_rate": 6.03298468401089e-07, "loss": 0.3466, "step": 11430 }, { "epoch": 3.888397602142766, "grad_norm": 3.061000778590297, "learning_rate": 6.01532689225596e-07, "loss": 0.3433, "step": 11435 }, { "epoch": 3.8900982101101143, "grad_norm": 3.6907552434175925, "learning_rate": 5.997691444746748e-07, "loss": 0.3466, "step": 11440 }, { "epoch": 3.8917988180774628, "grad_norm": 4.739034167572349, "learning_rate": 5.980078362239525e-07, "loss": 0.3391, "step": 11445 }, { "epoch": 3.8934994260448113, "grad_norm": 6.560917584236525, "learning_rate": 5.962487665464217e-07, "loss": 0.3555, "step": 11450 }, { "epoch": 3.8952000340121593, "grad_norm": 4.935951791754003, "learning_rate": 5.94491937512442e-07, "loss": 0.3389, "step": 11455 }, { "epoch": 3.896900641979508, "grad_norm": 3.786243244585295, "learning_rate": 5.927373511897341e-07, "loss": 0.3352, "step": 11460 }, { "epoch": 3.898601249946856, "grad_norm": 4.731739492632203, "learning_rate": 5.909850096433814e-07, "loss": 0.3542, "step": 11465 }, { "epoch": 3.9003018579142044, "grad_norm": 5.166070287462641, "learning_rate": 5.892349149358223e-07, "loss": 0.3354, "step": 11470 }, { "epoch": 3.9020024658815524, "grad_norm": 5.070315415224376, "learning_rate": 5.874870691268542e-07, "loss": 0.3449, "step": 11475 }, { "epoch": 3.903703073848901, "grad_norm": 5.32512289281027, "learning_rate": 5.857414742736239e-07, "loss": 0.342, "step": 11480 }, { "epoch": 3.9054036818162494, "grad_norm": 11.970175542429455, "learning_rate": 5.839981324306327e-07, "loss": 0.3565, "step": 11485 }, { "epoch": 3.9071042897835975, "grad_norm": 4.697387642701096, "learning_rate": 5.822570456497267e-07, "loss": 0.3442, "step": 11490 }, { "epoch": 3.908804897750946, "grad_norm": 3.2676261209216415, "learning_rate": 5.805182159801003e-07, "loss": 0.3518, "step": 11495 }, { "epoch": 3.9105055057182945, "grad_norm": 3.527915684751866, "learning_rate": 5.787816454682913e-07, "loss": 0.3454, "step": 11500 }, { "epoch": 3.9122061136856425, "grad_norm": 3.9774769863607853, "learning_rate": 5.770473361581763e-07, "loss": 0.3573, "step": 11505 }, { "epoch": 3.913906721652991, "grad_norm": 4.598293619230731, "learning_rate": 5.753152900909739e-07, "loss": 0.3219, "step": 11510 }, { "epoch": 3.9156073296203395, "grad_norm": 4.803845055709252, "learning_rate": 5.73585509305235e-07, "loss": 0.343, "step": 11515 }, { "epoch": 3.9173079375876876, "grad_norm": 5.752345994677968, "learning_rate": 5.718579958368486e-07, "loss": 0.3603, "step": 11520 }, { "epoch": 3.919008545555036, "grad_norm": 3.582750747285596, "learning_rate": 5.701327517190311e-07, "loss": 0.3415, "step": 11525 }, { "epoch": 3.920709153522384, "grad_norm": 6.75928754986394, "learning_rate": 5.684097789823318e-07, "loss": 0.3307, "step": 11530 }, { "epoch": 3.9224097614897326, "grad_norm": 4.849660560457972, "learning_rate": 5.666890796546228e-07, "loss": 0.3349, "step": 11535 }, { "epoch": 3.9241103694570807, "grad_norm": 4.078913800118893, "learning_rate": 5.649706557611043e-07, "loss": 0.3587, "step": 11540 }, { "epoch": 3.925810977424429, "grad_norm": 5.998123726294037, "learning_rate": 5.632545093242949e-07, "loss": 0.3386, "step": 11545 }, { "epoch": 3.9275115853917777, "grad_norm": 12.909793774483513, "learning_rate": 5.615406423640355e-07, "loss": 0.3558, "step": 11550 }, { "epoch": 3.9292121933591257, "grad_norm": 9.161852962276848, "learning_rate": 5.598290568974829e-07, "loss": 0.3484, "step": 11555 }, { "epoch": 3.9309128013264742, "grad_norm": 5.681001215541077, "learning_rate": 5.58119754939108e-07, "loss": 0.345, "step": 11560 }, { "epoch": 3.9326134092938227, "grad_norm": 18.390125379797897, "learning_rate": 5.564127385006962e-07, "loss": 0.3573, "step": 11565 }, { "epoch": 3.934314017261171, "grad_norm": 4.473431760016146, "learning_rate": 5.547080095913399e-07, "loss": 0.3613, "step": 11570 }, { "epoch": 3.9360146252285193, "grad_norm": 5.011396356874849, "learning_rate": 5.530055702174428e-07, "loss": 0.3577, "step": 11575 }, { "epoch": 3.9377152331958674, "grad_norm": 3.4884679262972322, "learning_rate": 5.513054223827099e-07, "loss": 0.3486, "step": 11580 }, { "epoch": 3.939415841163216, "grad_norm": 7.556067494975034, "learning_rate": 5.49607568088153e-07, "loss": 0.34, "step": 11585 }, { "epoch": 3.941116449130564, "grad_norm": 3.5129385303314553, "learning_rate": 5.479120093320814e-07, "loss": 0.3394, "step": 11590 }, { "epoch": 3.9428170570979124, "grad_norm": 5.3164494083512395, "learning_rate": 5.462187481101053e-07, "loss": 0.3578, "step": 11595 }, { "epoch": 3.944517665065261, "grad_norm": 6.204423170047807, "learning_rate": 5.445277864151277e-07, "loss": 0.3382, "step": 11600 }, { "epoch": 3.946218273032609, "grad_norm": 4.216013760119267, "learning_rate": 5.428391262373483e-07, "loss": 0.3263, "step": 11605 }, { "epoch": 3.9479188809999575, "grad_norm": 4.254713132798691, "learning_rate": 5.411527695642557e-07, "loss": 0.3322, "step": 11610 }, { "epoch": 3.949619488967306, "grad_norm": 7.176228807971487, "learning_rate": 5.394687183806286e-07, "loss": 0.3306, "step": 11615 }, { "epoch": 3.951320096934654, "grad_norm": 4.554733786870882, "learning_rate": 5.377869746685326e-07, "loss": 0.3653, "step": 11620 }, { "epoch": 3.9530207049020025, "grad_norm": 5.883493743275225, "learning_rate": 5.361075404073151e-07, "loss": 0.3578, "step": 11625 }, { "epoch": 3.954721312869351, "grad_norm": 5.552533536404681, "learning_rate": 5.344304175736089e-07, "loss": 0.3498, "step": 11630 }, { "epoch": 3.956421920836699, "grad_norm": 3.6293689879441855, "learning_rate": 5.327556081413221e-07, "loss": 0.3344, "step": 11635 }, { "epoch": 3.9581225288040476, "grad_norm": 5.834120539117083, "learning_rate": 5.31083114081645e-07, "loss": 0.3537, "step": 11640 }, { "epoch": 3.9598231367713956, "grad_norm": 3.559537450025789, "learning_rate": 5.294129373630383e-07, "loss": 0.3453, "step": 11645 }, { "epoch": 3.961523744738744, "grad_norm": 5.880313505645204, "learning_rate": 5.277450799512382e-07, "loss": 0.3399, "step": 11650 }, { "epoch": 3.963224352706092, "grad_norm": 30.648536531961824, "learning_rate": 5.260795438092492e-07, "loss": 0.359, "step": 11655 }, { "epoch": 3.9649249606734407, "grad_norm": 5.685154949790931, "learning_rate": 5.244163308973457e-07, "loss": 0.344, "step": 11660 }, { "epoch": 3.966625568640789, "grad_norm": 7.036854083405207, "learning_rate": 5.227554431730655e-07, "loss": 0.3735, "step": 11665 }, { "epoch": 3.9683261766081372, "grad_norm": 4.390123520957557, "learning_rate": 5.210968825912125e-07, "loss": 0.35, "step": 11670 }, { "epoch": 3.9700267845754857, "grad_norm": 13.586586371890261, "learning_rate": 5.19440651103848e-07, "loss": 0.3502, "step": 11675 }, { "epoch": 3.9717273925428342, "grad_norm": 8.034737107456621, "learning_rate": 5.177867506602962e-07, "loss": 0.3438, "step": 11680 }, { "epoch": 3.9734280005101823, "grad_norm": 3.861685766413837, "learning_rate": 5.16135183207134e-07, "loss": 0.3406, "step": 11685 }, { "epoch": 3.975128608477531, "grad_norm": 5.740961553671016, "learning_rate": 5.144859506881955e-07, "loss": 0.3297, "step": 11690 }, { "epoch": 3.9768292164448793, "grad_norm": 5.307904514450347, "learning_rate": 5.128390550445642e-07, "loss": 0.3348, "step": 11695 }, { "epoch": 3.9785298244122274, "grad_norm": 4.416571373165715, "learning_rate": 5.111944982145744e-07, "loss": 0.3519, "step": 11700 }, { "epoch": 3.980230432379576, "grad_norm": 4.4894924508616265, "learning_rate": 5.095522821338089e-07, "loss": 0.3511, "step": 11705 }, { "epoch": 3.981931040346924, "grad_norm": 4.571727707355725, "learning_rate": 5.079124087350925e-07, "loss": 0.3525, "step": 11710 }, { "epoch": 3.9836316483142724, "grad_norm": 4.111811038770526, "learning_rate": 5.062748799484962e-07, "loss": 0.3294, "step": 11715 }, { "epoch": 3.9853322562816205, "grad_norm": 4.800365658437861, "learning_rate": 5.046396977013279e-07, "loss": 0.3443, "step": 11720 }, { "epoch": 3.987032864248969, "grad_norm": 10.987577764196464, "learning_rate": 5.03006863918137e-07, "loss": 0.3356, "step": 11725 }, { "epoch": 3.9887334722163175, "grad_norm": 4.778769125156549, "learning_rate": 5.013763805207065e-07, "loss": 0.3397, "step": 11730 }, { "epoch": 3.9904340801836655, "grad_norm": 3.846104030028701, "learning_rate": 4.997482494280545e-07, "loss": 0.3285, "step": 11735 }, { "epoch": 3.992134688151014, "grad_norm": 4.834777993843937, "learning_rate": 4.981224725564296e-07, "loss": 0.3601, "step": 11740 }, { "epoch": 3.9938352961183625, "grad_norm": 10.691376689487361, "learning_rate": 4.964990518193108e-07, "loss": 0.3092, "step": 11745 }, { "epoch": 3.9955359040857106, "grad_norm": 5.0267727496093855, "learning_rate": 4.948779891274022e-07, "loss": 0.3442, "step": 11750 }, { "epoch": 3.997236512053059, "grad_norm": 6.727200424471898, "learning_rate": 4.93259286388634e-07, "loss": 0.3259, "step": 11755 }, { "epoch": 3.998937120020407, "grad_norm": 3.938920036381914, "learning_rate": 4.916429455081589e-07, "loss": 0.362, "step": 11760 }, { "epoch": 4.00034012159347, "grad_norm": 5.135731470701529, "learning_rate": 4.900289683883483e-07, "loss": 0.2891, "step": 11765 }, { "epoch": 4.002040729560818, "grad_norm": 4.496709671994853, "learning_rate": 4.884173569287937e-07, "loss": 0.3317, "step": 11770 }, { "epoch": 4.003741337528166, "grad_norm": 6.208874128074458, "learning_rate": 4.868081130263e-07, "loss": 0.3266, "step": 11775 }, { "epoch": 4.005441945495515, "grad_norm": 3.4389004641980785, "learning_rate": 4.852012385748875e-07, "loss": 0.3181, "step": 11780 }, { "epoch": 4.007142553462863, "grad_norm": 3.8965981469706903, "learning_rate": 4.835967354657864e-07, "loss": 0.3317, "step": 11785 }, { "epoch": 4.008843161430211, "grad_norm": 4.244085913003326, "learning_rate": 4.81994605587437e-07, "loss": 0.3383, "step": 11790 }, { "epoch": 4.010543769397559, "grad_norm": 4.734834888059188, "learning_rate": 4.803948508254852e-07, "loss": 0.3075, "step": 11795 }, { "epoch": 4.012244377364908, "grad_norm": 5.687682401041763, "learning_rate": 4.787974730627832e-07, "loss": 0.3404, "step": 11800 }, { "epoch": 4.013944985332256, "grad_norm": 4.60895234001638, "learning_rate": 4.772024741793829e-07, "loss": 0.3288, "step": 11805 }, { "epoch": 4.015645593299604, "grad_norm": 3.3667932580721316, "learning_rate": 4.756098560525396e-07, "loss": 0.318, "step": 11810 }, { "epoch": 4.017346201266953, "grad_norm": 7.397026909437256, "learning_rate": 4.7401962055670383e-07, "loss": 0.3168, "step": 11815 }, { "epoch": 4.019046809234301, "grad_norm": 3.8001404263947447, "learning_rate": 4.7243176956352304e-07, "loss": 0.3255, "step": 11820 }, { "epoch": 4.020747417201649, "grad_norm": 4.107057495039566, "learning_rate": 4.7084630494183907e-07, "loss": 0.3262, "step": 11825 }, { "epoch": 4.022448025168998, "grad_norm": 4.666556025494843, "learning_rate": 4.692632285576826e-07, "loss": 0.3335, "step": 11830 }, { "epoch": 4.024148633136346, "grad_norm": 17.10531736370999, "learning_rate": 4.6768254227427644e-07, "loss": 0.3335, "step": 11835 }, { "epoch": 4.025849241103694, "grad_norm": 3.8543574029556456, "learning_rate": 4.661042479520275e-07, "loss": 0.354, "step": 11840 }, { "epoch": 4.027549849071043, "grad_norm": 4.412476871447233, "learning_rate": 4.6452834744853013e-07, "loss": 0.3471, "step": 11845 }, { "epoch": 4.029250457038391, "grad_norm": 6.062941877478426, "learning_rate": 4.6295484261855845e-07, "loss": 0.3565, "step": 11850 }, { "epoch": 4.030951065005739, "grad_norm": 4.212755558174354, "learning_rate": 4.613837353140696e-07, "loss": 0.3366, "step": 11855 }, { "epoch": 4.0326516729730875, "grad_norm": 8.39574948864062, "learning_rate": 4.59815027384197e-07, "loss": 0.3487, "step": 11860 }, { "epoch": 4.034352280940436, "grad_norm": 3.6531708455443277, "learning_rate": 4.5824872067525173e-07, "loss": 0.3356, "step": 11865 }, { "epoch": 4.0360528889077845, "grad_norm": 10.149279127989958, "learning_rate": 4.566848170307167e-07, "loss": 0.3399, "step": 11870 }, { "epoch": 4.0377534968751325, "grad_norm": 8.009957540856208, "learning_rate": 4.551233182912482e-07, "loss": 0.3355, "step": 11875 }, { "epoch": 4.0394541048424815, "grad_norm": 6.730013534884652, "learning_rate": 4.5356422629467183e-07, "loss": 0.3265, "step": 11880 }, { "epoch": 4.0411547128098295, "grad_norm": 3.4953852628691076, "learning_rate": 4.5200754287597957e-07, "loss": 0.3584, "step": 11885 }, { "epoch": 4.042855320777178, "grad_norm": 5.58030837836292, "learning_rate": 4.504532698673297e-07, "loss": 0.3411, "step": 11890 }, { "epoch": 4.0445559287445265, "grad_norm": 5.8306395005855975, "learning_rate": 4.4890140909804256e-07, "loss": 0.3174, "step": 11895 }, { "epoch": 4.046256536711875, "grad_norm": 5.031289449422286, "learning_rate": 4.473519623946007e-07, "loss": 0.3415, "step": 11900 }, { "epoch": 4.047957144679223, "grad_norm": 3.398075231787273, "learning_rate": 4.4580493158064344e-07, "loss": 0.3361, "step": 11905 }, { "epoch": 4.049657752646572, "grad_norm": 5.040352157836528, "learning_rate": 4.442603184769692e-07, "loss": 0.3512, "step": 11910 }, { "epoch": 4.05135836061392, "grad_norm": 4.506327411419073, "learning_rate": 4.4271812490152777e-07, "loss": 0.331, "step": 11915 }, { "epoch": 4.053058968581268, "grad_norm": 4.242167387806916, "learning_rate": 4.411783526694247e-07, "loss": 0.3537, "step": 11920 }, { "epoch": 4.054759576548616, "grad_norm": 4.766638417890972, "learning_rate": 4.396410035929122e-07, "loss": 0.3614, "step": 11925 }, { "epoch": 4.056460184515965, "grad_norm": 4.9021226309565336, "learning_rate": 4.3810607948139404e-07, "loss": 0.3467, "step": 11930 }, { "epoch": 4.058160792483313, "grad_norm": 6.173945901366074, "learning_rate": 4.365735821414163e-07, "loss": 0.3183, "step": 11935 }, { "epoch": 4.059861400450661, "grad_norm": 6.092350405877605, "learning_rate": 4.35043513376672e-07, "loss": 0.3365, "step": 11940 }, { "epoch": 4.06156200841801, "grad_norm": 9.884534395369656, "learning_rate": 4.3351587498799474e-07, "loss": 0.3301, "step": 11945 }, { "epoch": 4.063262616385358, "grad_norm": 7.639977419079223, "learning_rate": 4.3199066877335604e-07, "loss": 0.3391, "step": 11950 }, { "epoch": 4.064963224352706, "grad_norm": 5.863388161011214, "learning_rate": 4.3046789652786776e-07, "loss": 0.3283, "step": 11955 }, { "epoch": 4.066663832320055, "grad_norm": 9.919872534614637, "learning_rate": 4.2894756004377395e-07, "loss": 0.3301, "step": 11960 }, { "epoch": 4.068364440287403, "grad_norm": 4.407329918289548, "learning_rate": 4.2742966111045474e-07, "loss": 0.3328, "step": 11965 }, { "epoch": 4.070065048254751, "grad_norm": 8.995267251325002, "learning_rate": 4.2591420151441907e-07, "loss": 0.3232, "step": 11970 }, { "epoch": 4.0717656562221, "grad_norm": 4.424394805699658, "learning_rate": 4.244011830393069e-07, "loss": 0.3306, "step": 11975 }, { "epoch": 4.073466264189448, "grad_norm": 4.906959662453473, "learning_rate": 4.2289060746588287e-07, "loss": 0.3681, "step": 11980 }, { "epoch": 4.075166872156796, "grad_norm": 5.071382231563371, "learning_rate": 4.213824765720384e-07, "loss": 0.3211, "step": 11985 }, { "epoch": 4.076867480124144, "grad_norm": 3.9643075804477026, "learning_rate": 4.198767921327862e-07, "loss": 0.3458, "step": 11990 }, { "epoch": 4.078568088091493, "grad_norm": 10.401584275639529, "learning_rate": 4.183735559202606e-07, "loss": 0.3218, "step": 11995 }, { "epoch": 4.080268696058841, "grad_norm": 6.065678934112641, "learning_rate": 4.168727697037142e-07, "loss": 0.3357, "step": 12000 }, { "epoch": 4.081969304026189, "grad_norm": 6.745161884490871, "learning_rate": 4.1537443524951535e-07, "loss": 0.3313, "step": 12005 }, { "epoch": 4.083669911993538, "grad_norm": 7.948902751138924, "learning_rate": 4.138785543211482e-07, "loss": 0.3399, "step": 12010 }, { "epoch": 4.085370519960886, "grad_norm": 3.3476008743028696, "learning_rate": 4.123851286792069e-07, "loss": 0.352, "step": 12015 }, { "epoch": 4.087071127928234, "grad_norm": 8.785461189996955, "learning_rate": 4.1089416008139896e-07, "loss": 0.3309, "step": 12020 }, { "epoch": 4.088771735895583, "grad_norm": 8.244632978963141, "learning_rate": 4.0940565028253637e-07, "loss": 0.3129, "step": 12025 }, { "epoch": 4.090472343862931, "grad_norm": 3.7533078229490675, "learning_rate": 4.07919601034541e-07, "loss": 0.3202, "step": 12030 }, { "epoch": 4.092172951830279, "grad_norm": 4.390947639905816, "learning_rate": 4.064360140864354e-07, "loss": 0.3221, "step": 12035 }, { "epoch": 4.093873559797627, "grad_norm": 5.598651161063812, "learning_rate": 4.0495489118434676e-07, "loss": 0.3559, "step": 12040 }, { "epoch": 4.095574167764976, "grad_norm": 4.761020664773868, "learning_rate": 4.034762340714998e-07, "loss": 0.3254, "step": 12045 }, { "epoch": 4.097274775732324, "grad_norm": 6.763489160147899, "learning_rate": 4.0200004448821965e-07, "loss": 0.3429, "step": 12050 }, { "epoch": 4.098975383699672, "grad_norm": 5.971382865942539, "learning_rate": 4.0052632417192456e-07, "loss": 0.3337, "step": 12055 }, { "epoch": 4.100675991667021, "grad_norm": 6.7303913678187826, "learning_rate": 3.990550748571284e-07, "loss": 0.3256, "step": 12060 }, { "epoch": 4.102376599634369, "grad_norm": 5.083156364017551, "learning_rate": 3.975862982754369e-07, "loss": 0.3276, "step": 12065 }, { "epoch": 4.104077207601717, "grad_norm": 9.445355887927477, "learning_rate": 3.961199961555437e-07, "loss": 0.3381, "step": 12070 }, { "epoch": 4.105777815569066, "grad_norm": 5.829236079661058, "learning_rate": 3.946561702232321e-07, "loss": 0.3232, "step": 12075 }, { "epoch": 4.107478423536414, "grad_norm": 5.016957899041762, "learning_rate": 3.9319482220136955e-07, "loss": 0.3531, "step": 12080 }, { "epoch": 4.109179031503762, "grad_norm": 5.537718475199659, "learning_rate": 3.917359538099083e-07, "loss": 0.3133, "step": 12085 }, { "epoch": 4.110879639471111, "grad_norm": 4.802846544132063, "learning_rate": 3.902795667658804e-07, "loss": 0.3379, "step": 12090 }, { "epoch": 4.112580247438459, "grad_norm": 7.493129998410939, "learning_rate": 3.8882566278340003e-07, "loss": 0.3264, "step": 12095 }, { "epoch": 4.1142808554058075, "grad_norm": 7.924193722489419, "learning_rate": 3.8737424357365634e-07, "loss": 0.313, "step": 12100 }, { "epoch": 4.1159814633731555, "grad_norm": 6.096111272079246, "learning_rate": 3.8592531084491594e-07, "loss": 0.3567, "step": 12105 }, { "epoch": 4.1176820713405045, "grad_norm": 8.966564981592384, "learning_rate": 3.84478866302517e-07, "loss": 0.326, "step": 12110 }, { "epoch": 4.1193826793078525, "grad_norm": 6.2294686801633015, "learning_rate": 3.8303491164887185e-07, "loss": 0.345, "step": 12115 }, { "epoch": 4.121083287275201, "grad_norm": 8.22935998004008, "learning_rate": 3.815934485834591e-07, "loss": 0.3341, "step": 12120 }, { "epoch": 4.1227838952425495, "grad_norm": 5.905648836128258, "learning_rate": 3.801544788028275e-07, "loss": 0.3152, "step": 12125 }, { "epoch": 4.124484503209898, "grad_norm": 3.4890504674637013, "learning_rate": 3.787180040005908e-07, "loss": 0.3425, "step": 12130 }, { "epoch": 4.126185111177246, "grad_norm": 3.7863186325993596, "learning_rate": 3.772840258674243e-07, "loss": 0.3399, "step": 12135 }, { "epoch": 4.127885719144595, "grad_norm": 5.226220359513184, "learning_rate": 3.758525460910681e-07, "loss": 0.3464, "step": 12140 }, { "epoch": 4.129586327111943, "grad_norm": 9.037509795642451, "learning_rate": 3.744235663563181e-07, "loss": 0.3428, "step": 12145 }, { "epoch": 4.131286935079291, "grad_norm": 5.30353100529993, "learning_rate": 3.729970883450315e-07, "loss": 0.3329, "step": 12150 }, { "epoch": 4.132987543046639, "grad_norm": 5.967745002578333, "learning_rate": 3.715731137361178e-07, "loss": 0.3414, "step": 12155 }, { "epoch": 4.134688151013988, "grad_norm": 3.6828922962503916, "learning_rate": 3.701516442055425e-07, "loss": 0.3219, "step": 12160 }, { "epoch": 4.136388758981336, "grad_norm": 4.306941870592475, "learning_rate": 3.687326814263209e-07, "loss": 0.3327, "step": 12165 }, { "epoch": 4.138089366948684, "grad_norm": 20.32543527178242, "learning_rate": 3.673162270685196e-07, "loss": 0.3409, "step": 12170 }, { "epoch": 4.139789974916033, "grad_norm": 9.974612321929813, "learning_rate": 3.6590228279925116e-07, "loss": 0.3322, "step": 12175 }, { "epoch": 4.141490582883381, "grad_norm": 5.616076419213856, "learning_rate": 3.644908502826755e-07, "loss": 0.332, "step": 12180 }, { "epoch": 4.143191190850729, "grad_norm": 6.9246878710743465, "learning_rate": 3.6308193117999573e-07, "loss": 0.3365, "step": 12185 }, { "epoch": 4.144891798818078, "grad_norm": 5.369312933630458, "learning_rate": 3.616755271494557e-07, "loss": 0.3192, "step": 12190 }, { "epoch": 4.146592406785426, "grad_norm": 5.059545246742273, "learning_rate": 3.60271639846341e-07, "loss": 0.317, "step": 12195 }, { "epoch": 4.148293014752774, "grad_norm": 3.4426579367914507, "learning_rate": 3.5887027092297327e-07, "loss": 0.3312, "step": 12200 }, { "epoch": 4.149993622720123, "grad_norm": 5.554490698341612, "learning_rate": 3.5747142202871204e-07, "loss": 0.3408, "step": 12205 }, { "epoch": 4.151694230687471, "grad_norm": 19.692853991946798, "learning_rate": 3.560750948099484e-07, "loss": 0.3324, "step": 12210 }, { "epoch": 4.153394838654819, "grad_norm": 4.387421607416816, "learning_rate": 3.5468129091010854e-07, "loss": 0.3025, "step": 12215 }, { "epoch": 4.155095446622167, "grad_norm": 3.695903233173566, "learning_rate": 3.5329001196964557e-07, "loss": 0.3233, "step": 12220 }, { "epoch": 4.156796054589516, "grad_norm": 8.639861256721842, "learning_rate": 3.5190125962604405e-07, "loss": 0.3372, "step": 12225 }, { "epoch": 4.158496662556864, "grad_norm": 5.297218588291433, "learning_rate": 3.5051503551381175e-07, "loss": 0.3599, "step": 12230 }, { "epoch": 4.160197270524212, "grad_norm": 4.766813102258966, "learning_rate": 3.4913134126448365e-07, "loss": 0.3458, "step": 12235 }, { "epoch": 4.161897878491561, "grad_norm": 3.890680257164456, "learning_rate": 3.4775017850661425e-07, "loss": 0.3197, "step": 12240 }, { "epoch": 4.163598486458909, "grad_norm": 2.740938586587348, "learning_rate": 3.4637154886578123e-07, "loss": 0.3442, "step": 12245 }, { "epoch": 4.165299094426257, "grad_norm": 4.018664745701717, "learning_rate": 3.4499545396458e-07, "loss": 0.3376, "step": 12250 }, { "epoch": 4.166999702393606, "grad_norm": 13.659635890780235, "learning_rate": 3.436218954226214e-07, "loss": 0.3299, "step": 12255 }, { "epoch": 4.168700310360954, "grad_norm": 4.199637246793097, "learning_rate": 3.422508748565334e-07, "loss": 0.3346, "step": 12260 }, { "epoch": 4.170400918328302, "grad_norm": 4.015286212983886, "learning_rate": 3.408823938799544e-07, "loss": 0.3408, "step": 12265 }, { "epoch": 4.172101526295651, "grad_norm": 14.278961850137955, "learning_rate": 3.395164541035359e-07, "loss": 0.3329, "step": 12270 }, { "epoch": 4.173802134262999, "grad_norm": 4.070675448977317, "learning_rate": 3.38153057134937e-07, "loss": 0.3293, "step": 12275 }, { "epoch": 4.175502742230347, "grad_norm": 7.07040459557904, "learning_rate": 3.3679220457882525e-07, "loss": 0.3378, "step": 12280 }, { "epoch": 4.177203350197695, "grad_norm": 6.11377470581607, "learning_rate": 3.3543389803687207e-07, "loss": 0.3417, "step": 12285 }, { "epoch": 4.178903958165044, "grad_norm": 5.743366459885062, "learning_rate": 3.340781391077541e-07, "loss": 0.3521, "step": 12290 }, { "epoch": 4.180604566132392, "grad_norm": 12.218672829962044, "learning_rate": 3.3272492938714733e-07, "loss": 0.3117, "step": 12295 }, { "epoch": 4.18230517409974, "grad_norm": 9.896118453917886, "learning_rate": 3.3137427046772975e-07, "loss": 0.3359, "step": 12300 }, { "epoch": 4.184005782067089, "grad_norm": 5.284094524863797, "learning_rate": 3.300261639391761e-07, "loss": 0.3132, "step": 12305 }, { "epoch": 4.185706390034437, "grad_norm": 3.7749420884738427, "learning_rate": 3.28680611388156e-07, "loss": 0.3173, "step": 12310 }, { "epoch": 4.187406998001785, "grad_norm": 20.930912970941833, "learning_rate": 3.273376143983356e-07, "loss": 0.3447, "step": 12315 }, { "epoch": 4.189107605969134, "grad_norm": 4.613633442599528, "learning_rate": 3.259971745503704e-07, "loss": 0.3421, "step": 12320 }, { "epoch": 4.190808213936482, "grad_norm": 5.5828361989817115, "learning_rate": 3.2465929342190867e-07, "loss": 0.3452, "step": 12325 }, { "epoch": 4.1925088219038305, "grad_norm": 4.37957903611316, "learning_rate": 3.233239725875853e-07, "loss": 0.3574, "step": 12330 }, { "epoch": 4.194209429871179, "grad_norm": 4.51865029906568, "learning_rate": 3.219912136190237e-07, "loss": 0.3309, "step": 12335 }, { "epoch": 4.1959100378385275, "grad_norm": 6.1822263663390755, "learning_rate": 3.206610180848296e-07, "loss": 0.3358, "step": 12340 }, { "epoch": 4.1976106458058755, "grad_norm": 3.701188018585034, "learning_rate": 3.1933338755059497e-07, "loss": 0.3205, "step": 12345 }, { "epoch": 4.199311253773224, "grad_norm": 7.457425086689321, "learning_rate": 3.1800832357888887e-07, "loss": 0.3302, "step": 12350 }, { "epoch": 4.2010118617405725, "grad_norm": 5.98366094900533, "learning_rate": 3.1668582772926367e-07, "loss": 0.3231, "step": 12355 }, { "epoch": 4.202712469707921, "grad_norm": 6.143107452522967, "learning_rate": 3.1536590155824554e-07, "loss": 0.3348, "step": 12360 }, { "epoch": 4.204413077675269, "grad_norm": 4.104381555667731, "learning_rate": 3.1404854661933853e-07, "loss": 0.3218, "step": 12365 }, { "epoch": 4.206113685642618, "grad_norm": 23.656709595759143, "learning_rate": 3.1273376446302073e-07, "loss": 0.3162, "step": 12370 }, { "epoch": 4.207814293609966, "grad_norm": 13.627337480427906, "learning_rate": 3.1142155663674013e-07, "loss": 0.3279, "step": 12375 }, { "epoch": 4.209514901577314, "grad_norm": 3.976971210462784, "learning_rate": 3.1011192468491657e-07, "loss": 0.3242, "step": 12380 }, { "epoch": 4.211215509544663, "grad_norm": 9.02059291841068, "learning_rate": 3.088048701489368e-07, "loss": 0.3399, "step": 12385 }, { "epoch": 4.212916117512011, "grad_norm": 7.79347074633455, "learning_rate": 3.075003945671559e-07, "loss": 0.3187, "step": 12390 }, { "epoch": 4.214616725479359, "grad_norm": 3.501972624953021, "learning_rate": 3.0619849947489123e-07, "loss": 0.3289, "step": 12395 }, { "epoch": 4.216317333446707, "grad_norm": 4.558927656676324, "learning_rate": 3.0489918640442593e-07, "loss": 0.3453, "step": 12400 }, { "epoch": 4.218017941414056, "grad_norm": 11.039193918155615, "learning_rate": 3.03602456885001e-07, "loss": 0.3145, "step": 12405 }, { "epoch": 4.219718549381404, "grad_norm": 4.691774804474439, "learning_rate": 3.0230831244281943e-07, "loss": 0.3244, "step": 12410 }, { "epoch": 4.221419157348752, "grad_norm": 37.11292414140762, "learning_rate": 3.010167546010395e-07, "loss": 0.316, "step": 12415 }, { "epoch": 4.223119765316101, "grad_norm": 4.6314090129134735, "learning_rate": 2.997277848797769e-07, "loss": 0.3254, "step": 12420 }, { "epoch": 4.224820373283449, "grad_norm": 4.314612424750355, "learning_rate": 2.9844140479610067e-07, "loss": 0.3282, "step": 12425 }, { "epoch": 4.226520981250797, "grad_norm": 9.376175709616241, "learning_rate": 2.97157615864031e-07, "loss": 0.3271, "step": 12430 }, { "epoch": 4.228221589218146, "grad_norm": 7.5985891288198175, "learning_rate": 2.9587641959454016e-07, "loss": 0.3177, "step": 12435 }, { "epoch": 4.229922197185494, "grad_norm": 3.748623010254426, "learning_rate": 2.945978174955466e-07, "loss": 0.3318, "step": 12440 }, { "epoch": 4.231622805152842, "grad_norm": 3.5006880481945637, "learning_rate": 2.9332181107191827e-07, "loss": 0.3305, "step": 12445 }, { "epoch": 4.233323413120191, "grad_norm": 5.626310084463864, "learning_rate": 2.9204840182546547e-07, "loss": 0.3236, "step": 12450 }, { "epoch": 4.235024021087539, "grad_norm": 4.888099802880378, "learning_rate": 2.90777591254944e-07, "loss": 0.3361, "step": 12455 }, { "epoch": 4.236724629054887, "grad_norm": 5.996948734691151, "learning_rate": 2.895093808560492e-07, "loss": 0.3179, "step": 12460 }, { "epoch": 4.238425237022235, "grad_norm": 9.206406290347116, "learning_rate": 2.882437721214179e-07, "loss": 0.3351, "step": 12465 }, { "epoch": 4.240125844989584, "grad_norm": 3.434852619964506, "learning_rate": 2.8698076654062266e-07, "loss": 0.3554, "step": 12470 }, { "epoch": 4.241826452956932, "grad_norm": 21.163064351724614, "learning_rate": 2.8572036560017504e-07, "loss": 0.3268, "step": 12475 }, { "epoch": 4.24352706092428, "grad_norm": 4.7982412121970635, "learning_rate": 2.84462570783518e-07, "loss": 0.3428, "step": 12480 }, { "epoch": 4.245227668891629, "grad_norm": 6.091033321935728, "learning_rate": 2.832073835710295e-07, "loss": 0.3298, "step": 12485 }, { "epoch": 4.246928276858977, "grad_norm": 3.8101277792160797, "learning_rate": 2.819548054400181e-07, "loss": 0.3448, "step": 12490 }, { "epoch": 4.248628884826325, "grad_norm": 4.627526846992593, "learning_rate": 2.8070483786472036e-07, "loss": 0.3364, "step": 12495 }, { "epoch": 4.250329492793674, "grad_norm": 11.60964562591399, "learning_rate": 2.7945748231630154e-07, "loss": 0.3361, "step": 12500 }, { "epoch": 4.252030100761022, "grad_norm": 3.4926976379349997, "learning_rate": 2.782127402628515e-07, "loss": 0.3433, "step": 12505 }, { "epoch": 4.25373070872837, "grad_norm": 4.21342937678202, "learning_rate": 2.7697061316938586e-07, "loss": 0.3018, "step": 12510 }, { "epoch": 4.255431316695718, "grad_norm": 7.127233077067102, "learning_rate": 2.7573110249784014e-07, "loss": 0.3407, "step": 12515 }, { "epoch": 4.257131924663067, "grad_norm": 8.685577768966226, "learning_rate": 2.7449420970707297e-07, "loss": 0.3041, "step": 12520 }, { "epoch": 4.258832532630415, "grad_norm": 4.294271976213762, "learning_rate": 2.732599362528596e-07, "loss": 0.332, "step": 12525 }, { "epoch": 4.260533140597763, "grad_norm": 4.503960368111351, "learning_rate": 2.7202828358789455e-07, "loss": 0.3232, "step": 12530 }, { "epoch": 4.262233748565112, "grad_norm": 4.529662416358458, "learning_rate": 2.7079925316178536e-07, "loss": 0.3121, "step": 12535 }, { "epoch": 4.26393435653246, "grad_norm": 4.065393870043167, "learning_rate": 2.6957284642105536e-07, "loss": 0.3622, "step": 12540 }, { "epoch": 4.265634964499808, "grad_norm": 8.070950137139976, "learning_rate": 2.6834906480913943e-07, "loss": 0.3289, "step": 12545 }, { "epoch": 4.267335572467157, "grad_norm": 4.742477935472126, "learning_rate": 2.671279097663818e-07, "loss": 0.3404, "step": 12550 }, { "epoch": 4.269036180434505, "grad_norm": 10.85672255056592, "learning_rate": 2.659093827300366e-07, "loss": 0.3502, "step": 12555 }, { "epoch": 4.2707367884018534, "grad_norm": 7.5889566127981105, "learning_rate": 2.6469348513426336e-07, "loss": 0.333, "step": 12560 }, { "epoch": 4.272437396369202, "grad_norm": 4.797312226441103, "learning_rate": 2.634802184101287e-07, "loss": 0.3096, "step": 12565 }, { "epoch": 4.2741380043365504, "grad_norm": 3.940287114364061, "learning_rate": 2.6226958398560124e-07, "loss": 0.3189, "step": 12570 }, { "epoch": 4.2758386123038985, "grad_norm": 5.914220125672408, "learning_rate": 2.6106158328555313e-07, "loss": 0.3348, "step": 12575 }, { "epoch": 4.277539220271247, "grad_norm": 13.579740966019758, "learning_rate": 2.598562177317543e-07, "loss": 0.3529, "step": 12580 }, { "epoch": 4.2792398282385955, "grad_norm": 15.490252641479021, "learning_rate": 2.58653488742876e-07, "loss": 0.3415, "step": 12585 }, { "epoch": 4.2809404362059436, "grad_norm": 6.938888473730673, "learning_rate": 2.574533977344837e-07, "loss": 0.3337, "step": 12590 }, { "epoch": 4.282641044173292, "grad_norm": 3.7425468287506827, "learning_rate": 2.562559461190406e-07, "loss": 0.3344, "step": 12595 }, { "epoch": 4.2843416521406406, "grad_norm": 7.1439052669196865, "learning_rate": 2.550611353059013e-07, "loss": 0.3367, "step": 12600 }, { "epoch": 4.286042260107989, "grad_norm": 14.236621388928102, "learning_rate": 2.5386896670131336e-07, "loss": 0.3216, "step": 12605 }, { "epoch": 4.287742868075337, "grad_norm": 3.9103518012490803, "learning_rate": 2.5267944170841494e-07, "loss": 0.3453, "step": 12610 }, { "epoch": 4.289443476042686, "grad_norm": 15.03324667573704, "learning_rate": 2.5149256172723095e-07, "loss": 0.336, "step": 12615 }, { "epoch": 4.291144084010034, "grad_norm": 4.9505553564093825, "learning_rate": 2.5030832815467615e-07, "loss": 0.3391, "step": 12620 }, { "epoch": 4.292844691977382, "grad_norm": 12.07133173210356, "learning_rate": 2.4912674238454724e-07, "loss": 0.3191, "step": 12625 }, { "epoch": 4.29454529994473, "grad_norm": 4.240895964099319, "learning_rate": 2.479478058075274e-07, "loss": 0.3407, "step": 12630 }, { "epoch": 4.296245907912079, "grad_norm": 5.293194735108943, "learning_rate": 2.4677151981117946e-07, "loss": 0.3018, "step": 12635 }, { "epoch": 4.297946515879427, "grad_norm": 6.1948378074512025, "learning_rate": 2.4559788577994903e-07, "loss": 0.3442, "step": 12640 }, { "epoch": 4.299647123846775, "grad_norm": 4.208129472196771, "learning_rate": 2.4442690509515835e-07, "loss": 0.342, "step": 12645 }, { "epoch": 4.301347731814124, "grad_norm": 5.572045612595394, "learning_rate": 2.432585791350081e-07, "loss": 0.3345, "step": 12650 }, { "epoch": 4.303048339781472, "grad_norm": 25.10758918360812, "learning_rate": 2.420929092745733e-07, "loss": 0.3606, "step": 12655 }, { "epoch": 4.30474894774882, "grad_norm": 3.9346043818193657, "learning_rate": 2.4092989688580373e-07, "loss": 0.3102, "step": 12660 }, { "epoch": 4.306449555716169, "grad_norm": 15.556170787230272, "learning_rate": 2.3976954333752216e-07, "loss": 0.3162, "step": 12665 }, { "epoch": 4.308150163683517, "grad_norm": 13.705266638952072, "learning_rate": 2.386118499954196e-07, "loss": 0.3277, "step": 12670 }, { "epoch": 4.309850771650865, "grad_norm": 3.5485079389858263, "learning_rate": 2.374568182220588e-07, "loss": 0.3131, "step": 12675 }, { "epoch": 4.311551379618214, "grad_norm": 4.700940412896038, "learning_rate": 2.3630444937686763e-07, "loss": 0.3549, "step": 12680 }, { "epoch": 4.313251987585562, "grad_norm": 6.919243111865925, "learning_rate": 2.3515474481614175e-07, "loss": 0.3373, "step": 12685 }, { "epoch": 4.31495259555291, "grad_norm": 26.410264409630383, "learning_rate": 2.3400770589303901e-07, "loss": 0.3449, "step": 12690 }, { "epoch": 4.316653203520259, "grad_norm": 4.110181771728492, "learning_rate": 2.3286333395758253e-07, "loss": 0.3302, "step": 12695 }, { "epoch": 4.318353811487607, "grad_norm": 5.228698212074843, "learning_rate": 2.3172163035665386e-07, "loss": 0.35, "step": 12700 }, { "epoch": 4.320054419454955, "grad_norm": 4.7766472552551535, "learning_rate": 2.3058259643399584e-07, "loss": 0.3256, "step": 12705 }, { "epoch": 4.321755027422303, "grad_norm": 4.290504770111564, "learning_rate": 2.294462335302078e-07, "loss": 0.3249, "step": 12710 }, { "epoch": 4.323455635389652, "grad_norm": 3.541077946999206, "learning_rate": 2.283125429827468e-07, "loss": 0.335, "step": 12715 }, { "epoch": 4.325156243357, "grad_norm": 3.962210605540325, "learning_rate": 2.271815261259236e-07, "loss": 0.3184, "step": 12720 }, { "epoch": 4.326856851324348, "grad_norm": 5.562231012105735, "learning_rate": 2.2605318429090224e-07, "loss": 0.3531, "step": 12725 }, { "epoch": 4.328557459291697, "grad_norm": 4.038234094819427, "learning_rate": 2.2492751880569958e-07, "loss": 0.3241, "step": 12730 }, { "epoch": 4.330258067259045, "grad_norm": 4.27507229275323, "learning_rate": 2.2380453099518057e-07, "loss": 0.3374, "step": 12735 }, { "epoch": 4.331958675226393, "grad_norm": 7.264680467702832, "learning_rate": 2.2268422218106017e-07, "loss": 0.3411, "step": 12740 }, { "epoch": 4.333659283193742, "grad_norm": 3.1918500942610173, "learning_rate": 2.2156659368189892e-07, "loss": 0.3428, "step": 12745 }, { "epoch": 4.33535989116109, "grad_norm": 9.260426826471768, "learning_rate": 2.2045164681310434e-07, "loss": 0.3258, "step": 12750 }, { "epoch": 4.337060499128438, "grad_norm": 5.721978841971327, "learning_rate": 2.193393828869264e-07, "loss": 0.3509, "step": 12755 }, { "epoch": 4.338761107095786, "grad_norm": 5.891909315177032, "learning_rate": 2.182298032124583e-07, "loss": 0.3381, "step": 12760 }, { "epoch": 4.340461715063135, "grad_norm": 7.437928772365634, "learning_rate": 2.171229090956331e-07, "loss": 0.3438, "step": 12765 }, { "epoch": 4.342162323030483, "grad_norm": 7.680142942754952, "learning_rate": 2.1601870183922402e-07, "loss": 0.3327, "step": 12770 }, { "epoch": 4.343862930997831, "grad_norm": 3.5973104304876835, "learning_rate": 2.1491718274284063e-07, "loss": 0.3397, "step": 12775 }, { "epoch": 4.34556353896518, "grad_norm": 4.860272530473138, "learning_rate": 2.1381835310293004e-07, "loss": 0.3532, "step": 12780 }, { "epoch": 4.347264146932528, "grad_norm": 3.7557972617596223, "learning_rate": 2.1272221421277383e-07, "loss": 0.3424, "step": 12785 }, { "epoch": 4.348964754899876, "grad_norm": 7.552587829453479, "learning_rate": 2.1162876736248534e-07, "loss": 0.3552, "step": 12790 }, { "epoch": 4.350665362867225, "grad_norm": 5.377756282356434, "learning_rate": 2.1053801383901117e-07, "loss": 0.3191, "step": 12795 }, { "epoch": 4.352365970834573, "grad_norm": 8.332497388343738, "learning_rate": 2.0944995492612614e-07, "loss": 0.3337, "step": 12800 }, { "epoch": 4.3540665788019215, "grad_norm": 8.811937511223524, "learning_rate": 2.0836459190443552e-07, "loss": 0.3191, "step": 12805 }, { "epoch": 4.35576718676927, "grad_norm": 8.97010518298963, "learning_rate": 2.072819260513703e-07, "loss": 0.3228, "step": 12810 }, { "epoch": 4.3574677947366185, "grad_norm": 53.90870953043256, "learning_rate": 2.0620195864118786e-07, "loss": 0.3305, "step": 12815 }, { "epoch": 4.3591684027039665, "grad_norm": 4.202582388810684, "learning_rate": 2.051246909449686e-07, "loss": 0.3373, "step": 12820 }, { "epoch": 4.360869010671315, "grad_norm": 11.864689055727705, "learning_rate": 2.0405012423061671e-07, "loss": 0.334, "step": 12825 }, { "epoch": 4.3625696186386635, "grad_norm": 4.62385496393284, "learning_rate": 2.0297825976285602e-07, "loss": 0.3125, "step": 12830 }, { "epoch": 4.364270226606012, "grad_norm": 8.993048469078259, "learning_rate": 2.0190909880323157e-07, "loss": 0.3504, "step": 12835 }, { "epoch": 4.36597083457336, "grad_norm": 6.876532401332347, "learning_rate": 2.0084264261010427e-07, "loss": 0.3355, "step": 12840 }, { "epoch": 4.367671442540709, "grad_norm": 5.528186576955831, "learning_rate": 1.9977889243865429e-07, "loss": 0.3237, "step": 12845 }, { "epoch": 4.369372050508057, "grad_norm": 7.219172837643221, "learning_rate": 1.9871784954087509e-07, "loss": 0.3386, "step": 12850 }, { "epoch": 4.371072658475405, "grad_norm": 4.891547091814564, "learning_rate": 1.9765951516557358e-07, "loss": 0.3086, "step": 12855 }, { "epoch": 4.372773266442754, "grad_norm": 6.735058635539884, "learning_rate": 1.9660389055837032e-07, "loss": 0.3194, "step": 12860 }, { "epoch": 4.374473874410102, "grad_norm": 6.233073816777856, "learning_rate": 1.9555097696169505e-07, "loss": 0.332, "step": 12865 }, { "epoch": 4.37617448237745, "grad_norm": 7.408104700323201, "learning_rate": 1.9450077561478875e-07, "loss": 0.3456, "step": 12870 }, { "epoch": 4.377875090344798, "grad_norm": 10.966215800829005, "learning_rate": 1.934532877536971e-07, "loss": 0.3387, "step": 12875 }, { "epoch": 4.379575698312147, "grad_norm": 7.624994132110487, "learning_rate": 1.9240851461127556e-07, "loss": 0.3444, "step": 12880 }, { "epoch": 4.381276306279495, "grad_norm": 4.453753920721134, "learning_rate": 1.913664574171814e-07, "loss": 0.3312, "step": 12885 }, { "epoch": 4.382976914246843, "grad_norm": 3.3374771090875863, "learning_rate": 1.9032711739787767e-07, "loss": 0.307, "step": 12890 }, { "epoch": 4.384677522214192, "grad_norm": 5.736321962818629, "learning_rate": 1.8929049577662783e-07, "loss": 0.3551, "step": 12895 }, { "epoch": 4.38637813018154, "grad_norm": 6.918296159726646, "learning_rate": 1.8825659377349704e-07, "loss": 0.3317, "step": 12900 }, { "epoch": 4.388078738148888, "grad_norm": 5.970245167001063, "learning_rate": 1.8722541260534856e-07, "loss": 0.3164, "step": 12905 }, { "epoch": 4.389779346116237, "grad_norm": 8.067603469833086, "learning_rate": 1.8619695348584433e-07, "loss": 0.353, "step": 12910 }, { "epoch": 4.391479954083585, "grad_norm": 4.4113207435854696, "learning_rate": 1.8517121762544138e-07, "loss": 0.3536, "step": 12915 }, { "epoch": 4.393180562050933, "grad_norm": 3.9116648499255584, "learning_rate": 1.841482062313929e-07, "loss": 0.3313, "step": 12920 }, { "epoch": 4.394881170018282, "grad_norm": 5.199219652097869, "learning_rate": 1.8312792050774408e-07, "loss": 0.3239, "step": 12925 }, { "epoch": 4.39658177798563, "grad_norm": 6.230748647200844, "learning_rate": 1.8211036165533324e-07, "loss": 0.3279, "step": 12930 }, { "epoch": 4.398282385952978, "grad_norm": 4.369606011124405, "learning_rate": 1.8109553087178906e-07, "loss": 0.3315, "step": 12935 }, { "epoch": 4.399982993920326, "grad_norm": 3.671190072982252, "learning_rate": 1.8008342935152855e-07, "loss": 0.3244, "step": 12940 }, { "epoch": 4.401683601887675, "grad_norm": 15.715314195589261, "learning_rate": 1.7907405828575808e-07, "loss": 0.3397, "step": 12945 }, { "epoch": 4.403384209855023, "grad_norm": 4.114398666967137, "learning_rate": 1.780674188624684e-07, "loss": 0.3246, "step": 12950 }, { "epoch": 4.405084817822371, "grad_norm": 4.237456116028077, "learning_rate": 1.7706351226643687e-07, "loss": 0.3046, "step": 12955 }, { "epoch": 4.40678542578972, "grad_norm": 5.355889648244975, "learning_rate": 1.7606233967922327e-07, "loss": 0.3192, "step": 12960 }, { "epoch": 4.408486033757068, "grad_norm": 4.169810263927873, "learning_rate": 1.7506390227917086e-07, "loss": 0.3324, "step": 12965 }, { "epoch": 4.410186641724416, "grad_norm": 3.2655003902600384, "learning_rate": 1.7406820124140162e-07, "loss": 0.3339, "step": 12970 }, { "epoch": 4.411887249691765, "grad_norm": 8.793739027776986, "learning_rate": 1.7307523773781948e-07, "loss": 0.326, "step": 12975 }, { "epoch": 4.413587857659113, "grad_norm": 4.087601343185249, "learning_rate": 1.7208501293710433e-07, "loss": 0.3265, "step": 12980 }, { "epoch": 4.415288465626461, "grad_norm": 7.843239771835991, "learning_rate": 1.7109752800471392e-07, "loss": 0.3446, "step": 12985 }, { "epoch": 4.416989073593809, "grad_norm": 4.0065398536899695, "learning_rate": 1.701127841028802e-07, "loss": 0.33, "step": 12990 }, { "epoch": 4.418689681561158, "grad_norm": 7.207831924182851, "learning_rate": 1.691307823906102e-07, "loss": 0.3259, "step": 12995 }, { "epoch": 4.420390289528506, "grad_norm": 6.902820446735924, "learning_rate": 1.6815152402368334e-07, "loss": 0.3333, "step": 13000 }, { "epoch": 4.422090897495854, "grad_norm": 4.112236792551807, "learning_rate": 1.6717501015464905e-07, "loss": 0.3222, "step": 13005 }, { "epoch": 4.423791505463203, "grad_norm": 4.152300939447504, "learning_rate": 1.6620124193282793e-07, "loss": 0.316, "step": 13010 }, { "epoch": 4.425492113430551, "grad_norm": 4.39405264061652, "learning_rate": 1.6523022050430826e-07, "loss": 0.3165, "step": 13015 }, { "epoch": 4.427192721397899, "grad_norm": 4.366272517121309, "learning_rate": 1.642619470119461e-07, "loss": 0.3398, "step": 13020 }, { "epoch": 4.428893329365248, "grad_norm": 3.589598372160312, "learning_rate": 1.6329642259536234e-07, "loss": 0.3312, "step": 13025 }, { "epoch": 4.430593937332596, "grad_norm": 3.4086614751423157, "learning_rate": 1.6233364839094324e-07, "loss": 0.3381, "step": 13030 }, { "epoch": 4.4322945452999445, "grad_norm": 8.301177756196116, "learning_rate": 1.6137362553183766e-07, "loss": 0.3644, "step": 13035 }, { "epoch": 4.433995153267293, "grad_norm": 4.825972903763466, "learning_rate": 1.604163551479568e-07, "loss": 0.3157, "step": 13040 }, { "epoch": 4.4356957612346415, "grad_norm": 3.3252927866456305, "learning_rate": 1.59461838365971e-07, "loss": 0.3311, "step": 13045 }, { "epoch": 4.4373963692019895, "grad_norm": 15.790663252217936, "learning_rate": 1.5851007630931115e-07, "loss": 0.3477, "step": 13050 }, { "epoch": 4.4390969771693385, "grad_norm": 5.767935348002293, "learning_rate": 1.5756107009816586e-07, "loss": 0.3274, "step": 13055 }, { "epoch": 4.4407975851366865, "grad_norm": 4.79697016294387, "learning_rate": 1.566148208494786e-07, "loss": 0.3316, "step": 13060 }, { "epoch": 4.442498193104035, "grad_norm": 4.731467520877539, "learning_rate": 1.556713296769502e-07, "loss": 0.3309, "step": 13065 }, { "epoch": 4.444198801071383, "grad_norm": 5.88336906810782, "learning_rate": 1.547305976910335e-07, "loss": 0.3464, "step": 13070 }, { "epoch": 4.445899409038732, "grad_norm": 5.955078035740091, "learning_rate": 1.5379262599893501e-07, "loss": 0.3263, "step": 13075 }, { "epoch": 4.44760001700608, "grad_norm": 5.12040950557112, "learning_rate": 1.5285741570461198e-07, "loss": 0.3085, "step": 13080 }, { "epoch": 4.449300624973428, "grad_norm": 10.023332032947414, "learning_rate": 1.5192496790877198e-07, "loss": 0.3169, "step": 13085 }, { "epoch": 4.451001232940777, "grad_norm": 4.201150526747543, "learning_rate": 1.5099528370887018e-07, "loss": 0.3304, "step": 13090 }, { "epoch": 4.452701840908125, "grad_norm": 3.888545762383645, "learning_rate": 1.5006836419911102e-07, "loss": 0.3412, "step": 13095 }, { "epoch": 4.454402448875473, "grad_norm": 3.65215765164716, "learning_rate": 1.4914421047044297e-07, "loss": 0.3324, "step": 13100 }, { "epoch": 4.456103056842822, "grad_norm": 4.101559637581969, "learning_rate": 1.4822282361056095e-07, "loss": 0.3137, "step": 13105 }, { "epoch": 4.45780366481017, "grad_norm": 3.6110004134728793, "learning_rate": 1.4730420470390193e-07, "loss": 0.3293, "step": 13110 }, { "epoch": 4.459504272777518, "grad_norm": 5.576116168819829, "learning_rate": 1.4638835483164581e-07, "loss": 0.3155, "step": 13115 }, { "epoch": 4.461204880744866, "grad_norm": 11.093818283062696, "learning_rate": 1.4547527507171422e-07, "loss": 0.3331, "step": 13120 }, { "epoch": 4.462905488712215, "grad_norm": 9.074945847283377, "learning_rate": 1.4456496649876668e-07, "loss": 0.3107, "step": 13125 }, { "epoch": 4.464606096679563, "grad_norm": 6.604581904508539, "learning_rate": 1.436574301842028e-07, "loss": 0.2965, "step": 13130 }, { "epoch": 4.466306704646911, "grad_norm": 4.158699458490385, "learning_rate": 1.427526671961582e-07, "loss": 0.3348, "step": 13135 }, { "epoch": 4.46800731261426, "grad_norm": 5.955671410779653, "learning_rate": 1.4185067859950553e-07, "loss": 0.3288, "step": 13140 }, { "epoch": 4.469707920581608, "grad_norm": 7.5923118833897165, "learning_rate": 1.4095146545585052e-07, "loss": 0.32, "step": 13145 }, { "epoch": 4.471408528548956, "grad_norm": 5.261442511978854, "learning_rate": 1.4005502882353418e-07, "loss": 0.3353, "step": 13150 }, { "epoch": 4.473109136516305, "grad_norm": 5.426856594721494, "learning_rate": 1.3916136975762772e-07, "loss": 0.305, "step": 13155 }, { "epoch": 4.474809744483653, "grad_norm": 4.841462577673882, "learning_rate": 1.3827048930993487e-07, "loss": 0.3428, "step": 13160 }, { "epoch": 4.476510352451001, "grad_norm": 3.6496581728439055, "learning_rate": 1.3738238852898794e-07, "loss": 0.3318, "step": 13165 }, { "epoch": 4.47821096041835, "grad_norm": 5.300579498588721, "learning_rate": 1.3649706846004862e-07, "loss": 0.3286, "step": 13170 }, { "epoch": 4.479911568385698, "grad_norm": 7.826093550107106, "learning_rate": 1.3561453014510506e-07, "loss": 0.3335, "step": 13175 }, { "epoch": 4.481612176353046, "grad_norm": 5.432994510418763, "learning_rate": 1.3473477462287166e-07, "loss": 0.3135, "step": 13180 }, { "epoch": 4.483312784320394, "grad_norm": 7.125487104046547, "learning_rate": 1.3385780292878764e-07, "loss": 0.3433, "step": 13185 }, { "epoch": 4.485013392287743, "grad_norm": 8.747829967326172, "learning_rate": 1.3298361609501513e-07, "loss": 0.3417, "step": 13190 }, { "epoch": 4.486714000255091, "grad_norm": 3.2934469329019627, "learning_rate": 1.321122151504403e-07, "loss": 0.3395, "step": 13195 }, { "epoch": 4.488414608222439, "grad_norm": 4.913953515495719, "learning_rate": 1.3124360112066775e-07, "loss": 0.3437, "step": 13200 }, { "epoch": 4.490115216189788, "grad_norm": 4.147259235923274, "learning_rate": 1.3037777502802497e-07, "loss": 0.3222, "step": 13205 }, { "epoch": 4.491815824157136, "grad_norm": 5.4785928158207335, "learning_rate": 1.2951473789155568e-07, "loss": 0.3309, "step": 13210 }, { "epoch": 4.493516432124484, "grad_norm": 4.861894897417913, "learning_rate": 1.2865449072702263e-07, "loss": 0.3389, "step": 13215 }, { "epoch": 4.495217040091833, "grad_norm": 4.048798735180959, "learning_rate": 1.27797034546904e-07, "loss": 0.3193, "step": 13220 }, { "epoch": 4.496917648059181, "grad_norm": 5.437187565523013, "learning_rate": 1.2694237036039393e-07, "loss": 0.3283, "step": 13225 }, { "epoch": 4.498618256026529, "grad_norm": 8.55892872499695, "learning_rate": 1.260904991733991e-07, "loss": 0.3128, "step": 13230 }, { "epoch": 4.500318863993877, "grad_norm": 7.9516763528490335, "learning_rate": 1.2524142198854062e-07, "loss": 0.3372, "step": 13235 }, { "epoch": 4.502019471961226, "grad_norm": 4.4812834219775155, "learning_rate": 1.2439513980515045e-07, "loss": 0.3501, "step": 13240 }, { "epoch": 4.503720079928574, "grad_norm": 5.268924961688235, "learning_rate": 1.2355165361927045e-07, "loss": 0.3548, "step": 13245 }, { "epoch": 4.505420687895922, "grad_norm": 3.3432882139172673, "learning_rate": 1.227109644236524e-07, "loss": 0.2985, "step": 13250 }, { "epoch": 4.507121295863271, "grad_norm": 4.14663984609508, "learning_rate": 1.2187307320775526e-07, "loss": 0.3348, "step": 13255 }, { "epoch": 4.508821903830619, "grad_norm": 3.9700860026519345, "learning_rate": 1.2103798095774665e-07, "loss": 0.3044, "step": 13260 }, { "epoch": 4.5105225117979675, "grad_norm": 3.9006630355912826, "learning_rate": 1.202056886564973e-07, "loss": 0.3241, "step": 13265 }, { "epoch": 4.512223119765316, "grad_norm": 6.998962310279112, "learning_rate": 1.1937619728358496e-07, "loss": 0.3182, "step": 13270 }, { "epoch": 4.5139237277326645, "grad_norm": 4.369582853724072, "learning_rate": 1.1854950781528901e-07, "loss": 0.3303, "step": 13275 }, { "epoch": 4.5156243357000125, "grad_norm": 4.789326373618072, "learning_rate": 1.177256212245928e-07, "loss": 0.3501, "step": 13280 }, { "epoch": 4.5173249436673615, "grad_norm": 8.395740382579294, "learning_rate": 1.1690453848117872e-07, "loss": 0.3328, "step": 13285 }, { "epoch": 4.5190255516347095, "grad_norm": 4.082702969850315, "learning_rate": 1.1608626055143068e-07, "loss": 0.3265, "step": 13290 }, { "epoch": 4.520726159602058, "grad_norm": 5.506729170134501, "learning_rate": 1.1527078839843164e-07, "loss": 0.3313, "step": 13295 }, { "epoch": 4.5224267675694065, "grad_norm": 4.600119129222766, "learning_rate": 1.1445812298196079e-07, "loss": 0.3422, "step": 13300 }, { "epoch": 4.524127375536755, "grad_norm": 5.250707685450341, "learning_rate": 1.1364826525849526e-07, "loss": 0.308, "step": 13305 }, { "epoch": 4.525827983504103, "grad_norm": 5.813778459770343, "learning_rate": 1.1284121618120675e-07, "loss": 0.3287, "step": 13310 }, { "epoch": 4.527528591471451, "grad_norm": 8.482249036813347, "learning_rate": 1.120369766999621e-07, "loss": 0.3542, "step": 13315 }, { "epoch": 4.5292291994388, "grad_norm": 44.4986547425875, "learning_rate": 1.1123554776132028e-07, "loss": 0.3279, "step": 13320 }, { "epoch": 4.530929807406148, "grad_norm": 5.616438060806645, "learning_rate": 1.1043693030853369e-07, "loss": 0.3327, "step": 13325 }, { "epoch": 4.532630415373496, "grad_norm": 6.246008971073881, "learning_rate": 1.0964112528154408e-07, "loss": 0.3208, "step": 13330 }, { "epoch": 4.534331023340845, "grad_norm": 20.38889674582648, "learning_rate": 1.0884813361698526e-07, "loss": 0.3313, "step": 13335 }, { "epoch": 4.536031631308193, "grad_norm": 4.253780013214621, "learning_rate": 1.0805795624817733e-07, "loss": 0.3053, "step": 13340 }, { "epoch": 4.537732239275541, "grad_norm": 6.978035722690048, "learning_rate": 1.0727059410513024e-07, "loss": 0.3326, "step": 13345 }, { "epoch": 4.539432847242889, "grad_norm": 4.420401496695248, "learning_rate": 1.0648604811453911e-07, "loss": 0.3306, "step": 13350 }, { "epoch": 4.541133455210238, "grad_norm": 9.816058519908898, "learning_rate": 1.0570431919978503e-07, "loss": 0.3374, "step": 13355 }, { "epoch": 4.542834063177586, "grad_norm": 9.606878566032291, "learning_rate": 1.0492540828093395e-07, "loss": 0.3263, "step": 13360 }, { "epoch": 4.544534671144934, "grad_norm": 7.715141830725023, "learning_rate": 1.0414931627473396e-07, "loss": 0.3301, "step": 13365 }, { "epoch": 4.546235279112283, "grad_norm": 4.680687545110913, "learning_rate": 1.0337604409461715e-07, "loss": 0.3276, "step": 13370 }, { "epoch": 4.547935887079631, "grad_norm": 4.453814003722188, "learning_rate": 1.0260559265069497e-07, "loss": 0.3377, "step": 13375 }, { "epoch": 4.549636495046979, "grad_norm": 23.469453151538225, "learning_rate": 1.0183796284976011e-07, "loss": 0.3589, "step": 13380 }, { "epoch": 4.551337103014328, "grad_norm": 4.088982279683541, "learning_rate": 1.0107315559528374e-07, "loss": 0.3125, "step": 13385 }, { "epoch": 4.553037710981676, "grad_norm": 4.8239304234020635, "learning_rate": 1.0031117178741557e-07, "loss": 0.3282, "step": 13390 }, { "epoch": 4.554738318949024, "grad_norm": 9.11888618207721, "learning_rate": 9.955201232298123e-08, "loss": 0.3315, "step": 13395 }, { "epoch": 4.556438926916373, "grad_norm": 3.8658203336746086, "learning_rate": 9.879567809548351e-08, "loss": 0.3154, "step": 13400 }, { "epoch": 4.558139534883721, "grad_norm": 4.233775252166157, "learning_rate": 9.804216999509897e-08, "loss": 0.3329, "step": 13405 }, { "epoch": 4.559840142851069, "grad_norm": 5.832164655350215, "learning_rate": 9.729148890867818e-08, "loss": 0.3363, "step": 13410 }, { "epoch": 4.561540750818418, "grad_norm": 4.4491927675603336, "learning_rate": 9.654363571974496e-08, "loss": 0.3341, "step": 13415 }, { "epoch": 4.563241358785766, "grad_norm": 4.530429269530576, "learning_rate": 9.57986113084941e-08, "loss": 0.3352, "step": 13420 }, { "epoch": 4.564941966753114, "grad_norm": 3.258923521195808, "learning_rate": 9.505641655179144e-08, "loss": 0.3142, "step": 13425 }, { "epoch": 4.566642574720462, "grad_norm": 4.257615335573072, "learning_rate": 9.431705232317179e-08, "loss": 0.3349, "step": 13430 }, { "epoch": 4.568343182687811, "grad_norm": 13.592790964567337, "learning_rate": 9.358051949283991e-08, "loss": 0.3212, "step": 13435 }, { "epoch": 4.570043790655159, "grad_norm": 6.975885256964604, "learning_rate": 9.284681892766629e-08, "loss": 0.3447, "step": 13440 }, { "epoch": 4.571744398622507, "grad_norm": 4.395602256939808, "learning_rate": 9.211595149118957e-08, "loss": 0.341, "step": 13445 }, { "epoch": 4.573445006589856, "grad_norm": 3.8954209473335295, "learning_rate": 9.138791804361253e-08, "loss": 0.3265, "step": 13450 }, { "epoch": 4.575145614557204, "grad_norm": 5.803626434070616, "learning_rate": 9.066271944180388e-08, "loss": 0.3122, "step": 13455 }, { "epoch": 4.576846222524552, "grad_norm": 4.045096160827611, "learning_rate": 8.99403565392945e-08, "loss": 0.3242, "step": 13460 }, { "epoch": 4.5785468304919, "grad_norm": 4.1985007006023425, "learning_rate": 8.922083018627875e-08, "loss": 0.3342, "step": 13465 }, { "epoch": 4.580247438459249, "grad_norm": 4.812436169736084, "learning_rate": 8.850414122961171e-08, "loss": 0.3441, "step": 13470 }, { "epoch": 4.581948046426597, "grad_norm": 3.7440161835946806, "learning_rate": 8.779029051280946e-08, "loss": 0.3282, "step": 13475 }, { "epoch": 4.583648654393945, "grad_norm": 5.370574064417769, "learning_rate": 8.7079278876048e-08, "loss": 0.3103, "step": 13480 }, { "epoch": 4.585349262361294, "grad_norm": 6.219787022073194, "learning_rate": 8.637110715616015e-08, "loss": 0.3431, "step": 13485 }, { "epoch": 4.587049870328642, "grad_norm": 4.935446514700299, "learning_rate": 8.566577618663807e-08, "loss": 0.3125, "step": 13490 }, { "epoch": 4.5887504782959905, "grad_norm": 12.87214378064997, "learning_rate": 8.496328679762967e-08, "loss": 0.341, "step": 13495 }, { "epoch": 4.590451086263339, "grad_norm": 4.231651641870352, "learning_rate": 8.426363981593855e-08, "loss": 0.3313, "step": 13500 }, { "epoch": 4.5921516942306875, "grad_norm": 3.879091516868406, "learning_rate": 8.356683606502269e-08, "loss": 0.3263, "step": 13505 }, { "epoch": 4.5938523021980355, "grad_norm": 5.468098031628809, "learning_rate": 8.287287636499414e-08, "loss": 0.3275, "step": 13510 }, { "epoch": 4.5955529101653845, "grad_norm": 3.575457095129105, "learning_rate": 8.218176153261704e-08, "loss": 0.326, "step": 13515 }, { "epoch": 4.5972535181327325, "grad_norm": 4.953138668741619, "learning_rate": 8.149349238130793e-08, "loss": 0.3381, "step": 13520 }, { "epoch": 4.598954126100081, "grad_norm": 5.24919771321698, "learning_rate": 8.080806972113331e-08, "loss": 0.3381, "step": 13525 }, { "epoch": 4.6006547340674295, "grad_norm": 4.288167964297829, "learning_rate": 8.012549435881007e-08, "loss": 0.3103, "step": 13530 }, { "epoch": 4.602355342034778, "grad_norm": 5.194859941964073, "learning_rate": 7.944576709770363e-08, "loss": 0.3264, "step": 13535 }, { "epoch": 4.604055950002126, "grad_norm": 5.901346190501034, "learning_rate": 7.87688887378274e-08, "loss": 0.3298, "step": 13540 }, { "epoch": 4.605756557969475, "grad_norm": 16.68260982892744, "learning_rate": 7.809486007584216e-08, "loss": 0.3375, "step": 13545 }, { "epoch": 4.607457165936823, "grad_norm": 4.701726667900006, "learning_rate": 7.742368190505334e-08, "loss": 0.3323, "step": 13550 }, { "epoch": 4.609157773904171, "grad_norm": 5.9509914549389595, "learning_rate": 7.67553550154132e-08, "loss": 0.3222, "step": 13555 }, { "epoch": 4.610858381871519, "grad_norm": 13.976456333749962, "learning_rate": 7.608988019351699e-08, "loss": 0.3507, "step": 13560 }, { "epoch": 4.612558989838868, "grad_norm": 3.8581391762891872, "learning_rate": 7.542725822260371e-08, "loss": 0.3353, "step": 13565 }, { "epoch": 4.614259597806216, "grad_norm": 4.830330543756697, "learning_rate": 7.476748988255428e-08, "loss": 0.3094, "step": 13570 }, { "epoch": 4.615960205773564, "grad_norm": 4.012557265721932, "learning_rate": 7.41105759498914e-08, "loss": 0.3173, "step": 13575 }, { "epoch": 4.617660813740913, "grad_norm": 4.248863274590059, "learning_rate": 7.345651719777775e-08, "loss": 0.3523, "step": 13580 }, { "epoch": 4.619361421708261, "grad_norm": 6.913313645146682, "learning_rate": 7.280531439601641e-08, "loss": 0.3098, "step": 13585 }, { "epoch": 4.621062029675609, "grad_norm": 3.841679848706814, "learning_rate": 7.215696831104791e-08, "loss": 0.3524, "step": 13590 }, { "epoch": 4.622762637642957, "grad_norm": 2.9552779005802967, "learning_rate": 7.151147970595129e-08, "loss": 0.3329, "step": 13595 }, { "epoch": 4.624463245610306, "grad_norm": 3.7823566013155685, "learning_rate": 7.086884934044302e-08, "loss": 0.3349, "step": 13600 }, { "epoch": 4.626163853577654, "grad_norm": 17.23201939605056, "learning_rate": 7.02290779708742e-08, "loss": 0.351, "step": 13605 }, { "epoch": 4.627864461545002, "grad_norm": 10.04970677446816, "learning_rate": 6.959216635023191e-08, "loss": 0.3265, "step": 13610 }, { "epoch": 4.629565069512351, "grad_norm": 5.0992004095698995, "learning_rate": 6.895811522813683e-08, "loss": 0.3352, "step": 13615 }, { "epoch": 4.631265677479699, "grad_norm": 4.928106483892405, "learning_rate": 6.832692535084395e-08, "loss": 0.325, "step": 13620 }, { "epoch": 4.632966285447047, "grad_norm": 4.225590866855814, "learning_rate": 6.769859746123931e-08, "loss": 0.3152, "step": 13625 }, { "epoch": 4.634666893414396, "grad_norm": 6.029621948014496, "learning_rate": 6.70731322988416e-08, "loss": 0.3344, "step": 13630 }, { "epoch": 4.636367501381744, "grad_norm": 5.840225724774846, "learning_rate": 6.645053059979923e-08, "loss": 0.3422, "step": 13635 }, { "epoch": 4.638068109349092, "grad_norm": 5.255603762891737, "learning_rate": 6.583079309689183e-08, "loss": 0.3314, "step": 13640 }, { "epoch": 4.639768717316441, "grad_norm": 4.098687599519045, "learning_rate": 6.521392051952653e-08, "loss": 0.3333, "step": 13645 }, { "epoch": 4.641469325283789, "grad_norm": 5.255583875205857, "learning_rate": 6.45999135937389e-08, "loss": 0.3271, "step": 13650 }, { "epoch": 4.643169933251137, "grad_norm": 5.571635080468063, "learning_rate": 6.398877304219287e-08, "loss": 0.3267, "step": 13655 }, { "epoch": 4.644870541218486, "grad_norm": 5.6399676170928315, "learning_rate": 6.338049958417692e-08, "loss": 0.3274, "step": 13660 }, { "epoch": 4.646571149185834, "grad_norm": 9.774654917394303, "learning_rate": 6.277509393560672e-08, "loss": 0.3303, "step": 13665 }, { "epoch": 4.648271757153182, "grad_norm": 4.612499635823047, "learning_rate": 6.217255680902146e-08, "loss": 0.3338, "step": 13670 }, { "epoch": 4.64997236512053, "grad_norm": 16.70420980081881, "learning_rate": 6.157288891358498e-08, "loss": 0.3191, "step": 13675 }, { "epoch": 4.651672973087879, "grad_norm": 3.7846285855313115, "learning_rate": 6.097609095508355e-08, "loss": 0.3203, "step": 13680 }, { "epoch": 4.653373581055227, "grad_norm": 4.60325436502952, "learning_rate": 6.038216363592614e-08, "loss": 0.3381, "step": 13685 }, { "epoch": 4.655074189022575, "grad_norm": 142.50050566162332, "learning_rate": 5.979110765514273e-08, "loss": 0.3313, "step": 13690 }, { "epoch": 4.656774796989924, "grad_norm": 23.86895854571442, "learning_rate": 5.92029237083841e-08, "loss": 0.3359, "step": 13695 }, { "epoch": 4.658475404957272, "grad_norm": 5.283896466005684, "learning_rate": 5.8617612487920364e-08, "loss": 0.3283, "step": 13700 }, { "epoch": 4.66017601292462, "grad_norm": 4.779382002247554, "learning_rate": 5.8035174682641024e-08, "loss": 0.3206, "step": 13705 }, { "epoch": 4.661876620891968, "grad_norm": 7.054684759020897, "learning_rate": 5.74556109780533e-08, "loss": 0.3202, "step": 13710 }, { "epoch": 4.663577228859317, "grad_norm": 7.206859116855702, "learning_rate": 5.6878922056281816e-08, "loss": 0.3219, "step": 13715 }, { "epoch": 4.665277836826665, "grad_norm": 6.1873721233225725, "learning_rate": 5.630510859606808e-08, "loss": 0.3104, "step": 13720 }, { "epoch": 4.6669784447940135, "grad_norm": 7.693323171395487, "learning_rate": 5.573417127276853e-08, "loss": 0.326, "step": 13725 }, { "epoch": 4.668679052761362, "grad_norm": 3.5717677038539812, "learning_rate": 5.5166110758355375e-08, "loss": 0.3046, "step": 13730 }, { "epoch": 4.6703796607287105, "grad_norm": 3.875300346012126, "learning_rate": 5.4600927721413786e-08, "loss": 0.3232, "step": 13735 }, { "epoch": 4.6720802686960585, "grad_norm": 4.185631255677343, "learning_rate": 5.403862282714362e-08, "loss": 0.3321, "step": 13740 }, { "epoch": 4.6737808766634075, "grad_norm": 14.202455049654725, "learning_rate": 5.347919673735602e-08, "loss": 0.3181, "step": 13745 }, { "epoch": 4.6754814846307555, "grad_norm": 4.3145911797537115, "learning_rate": 5.292265011047487e-08, "loss": 0.3529, "step": 13750 }, { "epoch": 4.677182092598104, "grad_norm": 4.891949675153146, "learning_rate": 5.236898360153425e-08, "loss": 0.3432, "step": 13755 }, { "epoch": 4.6788827005654525, "grad_norm": 6.60604531632661, "learning_rate": 5.181819786217901e-08, "loss": 0.3129, "step": 13760 }, { "epoch": 4.680583308532801, "grad_norm": 3.4956157623637902, "learning_rate": 5.1270293540663095e-08, "loss": 0.3234, "step": 13765 }, { "epoch": 4.682283916500149, "grad_norm": 6.734484318198087, "learning_rate": 5.072527128184956e-08, "loss": 0.3537, "step": 13770 }, { "epoch": 4.683984524467498, "grad_norm": 10.341178508350671, "learning_rate": 5.01831317272089e-08, "loss": 0.3357, "step": 13775 }, { "epoch": 4.685685132434846, "grad_norm": 4.047827109768496, "learning_rate": 4.964387551481875e-08, "loss": 0.3282, "step": 13780 }, { "epoch": 4.687385740402194, "grad_norm": 6.8714161053726315, "learning_rate": 4.910750327936392e-08, "loss": 0.3501, "step": 13785 }, { "epoch": 4.689086348369542, "grad_norm": 8.505102959858387, "learning_rate": 4.85740156521336e-08, "loss": 0.3168, "step": 13790 }, { "epoch": 4.690786956336891, "grad_norm": 4.7115482090418634, "learning_rate": 4.804341326102358e-08, "loss": 0.3459, "step": 13795 }, { "epoch": 4.692487564304239, "grad_norm": 6.277115723449739, "learning_rate": 4.75156967305318e-08, "loss": 0.3508, "step": 13800 }, { "epoch": 4.694188172271587, "grad_norm": 7.547462766787185, "learning_rate": 4.699086668176173e-08, "loss": 0.3464, "step": 13805 }, { "epoch": 4.695888780238936, "grad_norm": 4.296686807226226, "learning_rate": 4.646892373241812e-08, "loss": 0.3405, "step": 13810 }, { "epoch": 4.697589388206284, "grad_norm": 4.407793513535197, "learning_rate": 4.594986849680821e-08, "loss": 0.3285, "step": 13815 }, { "epoch": 4.699289996173632, "grad_norm": 10.953418019870245, "learning_rate": 4.543370158584054e-08, "loss": 0.3294, "step": 13820 }, { "epoch": 4.70099060414098, "grad_norm": 4.815143078865413, "learning_rate": 4.4920423607024144e-08, "loss": 0.3429, "step": 13825 }, { "epoch": 4.702691212108329, "grad_norm": 3.8748734207286373, "learning_rate": 4.441003516446773e-08, "loss": 0.3321, "step": 13830 }, { "epoch": 4.704391820075677, "grad_norm": 5.964705868539915, "learning_rate": 4.390253685887941e-08, "loss": 0.3069, "step": 13835 }, { "epoch": 4.706092428043025, "grad_norm": 4.517574517561679, "learning_rate": 4.339792928756581e-08, "loss": 0.3323, "step": 13840 }, { "epoch": 4.707793036010374, "grad_norm": 5.3957387491489985, "learning_rate": 4.289621304443076e-08, "loss": 0.34, "step": 13845 }, { "epoch": 4.709493643977722, "grad_norm": 7.735334001093503, "learning_rate": 4.239738871997551e-08, "loss": 0.3408, "step": 13850 }, { "epoch": 4.71119425194507, "grad_norm": 4.054492180541433, "learning_rate": 4.190145690129738e-08, "loss": 0.3139, "step": 13855 }, { "epoch": 4.712894859912419, "grad_norm": 6.348049582111246, "learning_rate": 4.140841817208946e-08, "loss": 0.3166, "step": 13860 }, { "epoch": 4.714595467879767, "grad_norm": 4.1275479057128575, "learning_rate": 4.091827311264007e-08, "loss": 0.3214, "step": 13865 }, { "epoch": 4.716296075847115, "grad_norm": 3.657059874633293, "learning_rate": 4.043102229983109e-08, "loss": 0.3241, "step": 13870 }, { "epoch": 4.717996683814464, "grad_norm": 8.612859433209668, "learning_rate": 3.994666630713878e-08, "loss": 0.3113, "step": 13875 }, { "epoch": 4.719697291781812, "grad_norm": 9.237348760144329, "learning_rate": 3.946520570463158e-08, "loss": 0.3434, "step": 13880 }, { "epoch": 4.72139789974916, "grad_norm": 7.960686824647489, "learning_rate": 3.898664105897065e-08, "loss": 0.3488, "step": 13885 }, { "epoch": 4.723098507716509, "grad_norm": 5.552691101089457, "learning_rate": 3.851097293340877e-08, "loss": 0.3466, "step": 13890 }, { "epoch": 4.724799115683857, "grad_norm": 4.29663431060009, "learning_rate": 3.803820188778895e-08, "loss": 0.3259, "step": 13895 }, { "epoch": 4.726499723651205, "grad_norm": 5.090123301435192, "learning_rate": 3.756832847854525e-08, "loss": 0.3119, "step": 13900 }, { "epoch": 4.728200331618554, "grad_norm": 6.696393552711733, "learning_rate": 3.710135325870085e-08, "loss": 0.3354, "step": 13905 }, { "epoch": 4.729900939585902, "grad_norm": 4.918072272655382, "learning_rate": 3.663727677786833e-08, "loss": 0.3327, "step": 13910 }, { "epoch": 4.73160154755325, "grad_norm": 5.2599558489731315, "learning_rate": 3.6176099582247716e-08, "loss": 0.3405, "step": 13915 }, { "epoch": 4.733302155520598, "grad_norm": 7.8895944654960815, "learning_rate": 3.5717822214627606e-08, "loss": 0.3335, "step": 13920 }, { "epoch": 4.735002763487947, "grad_norm": 4.802501219254316, "learning_rate": 3.526244521438321e-08, "loss": 0.3502, "step": 13925 }, { "epoch": 4.736703371455295, "grad_norm": 9.590248448123884, "learning_rate": 3.4809969117475806e-08, "loss": 0.3316, "step": 13930 }, { "epoch": 4.738403979422643, "grad_norm": 3.695100725537164, "learning_rate": 3.4360394456453004e-08, "loss": 0.3461, "step": 13935 }, { "epoch": 4.740104587389992, "grad_norm": 4.148114638485843, "learning_rate": 3.3913721760447104e-08, "loss": 0.3223, "step": 13940 }, { "epoch": 4.74180519535734, "grad_norm": 10.58552100482856, "learning_rate": 3.3469951555175075e-08, "loss": 0.3346, "step": 13945 }, { "epoch": 4.743505803324688, "grad_norm": 2.508138019154806, "learning_rate": 3.3029084362938005e-08, "loss": 0.332, "step": 13950 }, { "epoch": 4.7452064112920365, "grad_norm": 3.9719421989029775, "learning_rate": 3.259112070261944e-08, "loss": 0.3278, "step": 13955 }, { "epoch": 4.746907019259385, "grad_norm": 4.152108309347853, "learning_rate": 3.2156061089686776e-08, "loss": 0.3303, "step": 13960 }, { "epoch": 4.7486076272267335, "grad_norm": 5.436167936445684, "learning_rate": 3.172390603618847e-08, "loss": 0.3251, "step": 13965 }, { "epoch": 4.7503082351940815, "grad_norm": 4.499253144753351, "learning_rate": 3.129465605075488e-08, "loss": 0.3328, "step": 13970 }, { "epoch": 4.7520088431614305, "grad_norm": 14.816044571670451, "learning_rate": 3.086831163859661e-08, "loss": 0.3413, "step": 13975 }, { "epoch": 4.7537094511287785, "grad_norm": 4.341032786225299, "learning_rate": 3.044487330150558e-08, "loss": 0.3258, "step": 13980 }, { "epoch": 4.755410059096127, "grad_norm": 13.714824551807652, "learning_rate": 3.002434153785261e-08, "loss": 0.3166, "step": 13985 }, { "epoch": 4.7571106670634755, "grad_norm": 5.47757169285566, "learning_rate": 2.960671684258759e-08, "loss": 0.3161, "step": 13990 }, { "epoch": 4.758811275030824, "grad_norm": 3.7629727614857464, "learning_rate": 2.9191999707239292e-08, "loss": 0.3173, "step": 13995 }, { "epoch": 4.760511882998172, "grad_norm": 3.65754739288696, "learning_rate": 2.8780190619914216e-08, "loss": 0.3233, "step": 14000 }, { "epoch": 4.762212490965521, "grad_norm": 8.729693460847791, "learning_rate": 2.8371290065295764e-08, "loss": 0.3526, "step": 14005 }, { "epoch": 4.763913098932869, "grad_norm": 4.676875112956524, "learning_rate": 2.79652985246448e-08, "loss": 0.3408, "step": 14010 }, { "epoch": 4.765613706900217, "grad_norm": 17.946843511877965, "learning_rate": 2.7562216475797986e-08, "loss": 0.3343, "step": 14015 }, { "epoch": 4.767314314867566, "grad_norm": 3.772720874376048, "learning_rate": 2.7162044393167498e-08, "loss": 0.3221, "step": 14020 }, { "epoch": 4.769014922834914, "grad_norm": 7.405290276612255, "learning_rate": 2.676478274774158e-08, "loss": 0.3259, "step": 14025 }, { "epoch": 4.770715530802262, "grad_norm": 9.481284247622, "learning_rate": 2.6370432007081502e-08, "loss": 0.3209, "step": 14030 }, { "epoch": 4.77241613876961, "grad_norm": 6.465908227524739, "learning_rate": 2.5978992635323773e-08, "loss": 0.3221, "step": 14035 }, { "epoch": 4.774116746736959, "grad_norm": 6.157097149133531, "learning_rate": 2.5590465093177087e-08, "loss": 0.3272, "step": 14040 }, { "epoch": 4.775817354704307, "grad_norm": 3.8184457373119436, "learning_rate": 2.520484983792454e-08, "loss": 0.325, "step": 14045 }, { "epoch": 4.777517962671655, "grad_norm": 7.076044755832123, "learning_rate": 2.4822147323420032e-08, "loss": 0.3345, "step": 14050 }, { "epoch": 4.779218570639004, "grad_norm": 4.090298221303333, "learning_rate": 2.444235800009076e-08, "loss": 0.3248, "step": 14055 }, { "epoch": 4.780919178606352, "grad_norm": 11.176518096915373, "learning_rate": 2.406548231493361e-08, "loss": 0.3425, "step": 14060 }, { "epoch": 4.7826197865737, "grad_norm": 7.616449195698886, "learning_rate": 2.3691520711517923e-08, "loss": 0.3395, "step": 14065 }, { "epoch": 4.784320394541048, "grad_norm": 3.341981099864446, "learning_rate": 2.332047362998191e-08, "loss": 0.3482, "step": 14070 }, { "epoch": 4.786021002508397, "grad_norm": 6.11811041814841, "learning_rate": 2.295234150703429e-08, "loss": 0.345, "step": 14075 }, { "epoch": 4.787721610475745, "grad_norm": 4.48366183201597, "learning_rate": 2.258712477595265e-08, "loss": 0.3466, "step": 14080 }, { "epoch": 4.789422218443093, "grad_norm": 3.3110746330710454, "learning_rate": 2.2224823866583145e-08, "loss": 0.3197, "step": 14085 }, { "epoch": 4.791122826410442, "grad_norm": 5.6015300192522, "learning_rate": 2.186543920534051e-08, "loss": 0.3269, "step": 14090 }, { "epoch": 4.79282343437779, "grad_norm": 3.731054334037487, "learning_rate": 2.1508971215206953e-08, "loss": 0.3583, "step": 14095 }, { "epoch": 4.794524042345138, "grad_norm": 4.66466334818196, "learning_rate": 2.115542031573159e-08, "loss": 0.3386, "step": 14100 }, { "epoch": 4.796224650312487, "grad_norm": 5.25031709764381, "learning_rate": 2.0804786923031008e-08, "loss": 0.3379, "step": 14105 }, { "epoch": 4.797925258279835, "grad_norm": 10.516944628469465, "learning_rate": 2.0457071449787315e-08, "loss": 0.3225, "step": 14110 }, { "epoch": 4.799625866247183, "grad_norm": 3.5611550276427715, "learning_rate": 2.0112274305248426e-08, "loss": 0.3161, "step": 14115 }, { "epoch": 4.801326474214532, "grad_norm": 7.576371200543571, "learning_rate": 1.977039589522778e-08, "loss": 0.3405, "step": 14120 }, { "epoch": 4.80302708218188, "grad_norm": 5.451582141759962, "learning_rate": 1.943143662210295e-08, "loss": 0.3234, "step": 14125 }, { "epoch": 4.804727690149228, "grad_norm": 5.586511869261489, "learning_rate": 1.9095396884817043e-08, "loss": 0.3386, "step": 14130 }, { "epoch": 4.806428298116577, "grad_norm": 12.158692728698131, "learning_rate": 1.8762277078875346e-08, "loss": 0.3168, "step": 14135 }, { "epoch": 4.808128906083925, "grad_norm": 3.6248018478769986, "learning_rate": 1.843207759634813e-08, "loss": 0.3253, "step": 14140 }, { "epoch": 4.809829514051273, "grad_norm": 11.92575864371344, "learning_rate": 1.810479882586702e-08, "loss": 0.335, "step": 14145 }, { "epoch": 4.811530122018621, "grad_norm": 4.050455765642693, "learning_rate": 1.7780441152627227e-08, "loss": 0.3384, "step": 14150 }, { "epoch": 4.81323072998597, "grad_norm": 7.165397203447299, "learning_rate": 1.7459004958385317e-08, "loss": 0.3406, "step": 14155 }, { "epoch": 4.814931337953318, "grad_norm": 6.031532642070139, "learning_rate": 1.7140490621459782e-08, "loss": 0.3431, "step": 14160 }, { "epoch": 4.816631945920666, "grad_norm": 16.529377007280242, "learning_rate": 1.6824898516729916e-08, "loss": 0.2999, "step": 14165 }, { "epoch": 4.818332553888015, "grad_norm": 6.834412260587851, "learning_rate": 1.6512229015635817e-08, "loss": 0.3104, "step": 14170 }, { "epoch": 4.820033161855363, "grad_norm": 4.2081382672489775, "learning_rate": 1.620248248617784e-08, "loss": 0.324, "step": 14175 }, { "epoch": 4.821733769822711, "grad_norm": 4.531693339378879, "learning_rate": 1.5895659292915477e-08, "loss": 0.3341, "step": 14180 }, { "epoch": 4.8234343777900595, "grad_norm": 11.082332131058152, "learning_rate": 1.559175979696875e-08, "loss": 0.3127, "step": 14185 }, { "epoch": 4.825134985757408, "grad_norm": 6.595292995331069, "learning_rate": 1.5290784356015166e-08, "loss": 0.3168, "step": 14190 }, { "epoch": 4.8268355937247565, "grad_norm": 5.052925451408609, "learning_rate": 1.4992733324292465e-08, "loss": 0.299, "step": 14195 }, { "epoch": 4.8285362016921045, "grad_norm": 4.504702164111421, "learning_rate": 1.4697607052594487e-08, "loss": 0.3065, "step": 14200 }, { "epoch": 4.8302368096594535, "grad_norm": 5.758256087628319, "learning_rate": 1.4405405888274492e-08, "loss": 0.3106, "step": 14205 }, { "epoch": 4.8319374176268015, "grad_norm": 5.2199824865839215, "learning_rate": 1.4116130175241826e-08, "loss": 0.3041, "step": 14210 }, { "epoch": 4.83363802559415, "grad_norm": 3.9418870967087383, "learning_rate": 1.382978025396331e-08, "loss": 0.3182, "step": 14215 }, { "epoch": 4.8353386335614985, "grad_norm": 4.6303149964657875, "learning_rate": 1.3546356461462129e-08, "loss": 0.326, "step": 14220 }, { "epoch": 4.837039241528847, "grad_norm": 5.553330113835605, "learning_rate": 1.3265859131317004e-08, "loss": 0.3169, "step": 14225 }, { "epoch": 4.838739849496195, "grad_norm": 6.2443968915828405, "learning_rate": 1.2988288593663301e-08, "loss": 0.3101, "step": 14230 }, { "epoch": 4.840440457463544, "grad_norm": 4.663283684252, "learning_rate": 1.2713645175190526e-08, "loss": 0.3028, "step": 14235 }, { "epoch": 4.842141065430892, "grad_norm": 3.766050860064333, "learning_rate": 1.2441929199143998e-08, "loss": 0.3188, "step": 14240 }, { "epoch": 4.84384167339824, "grad_norm": 4.965225020822388, "learning_rate": 1.2173140985323183e-08, "loss": 0.3326, "step": 14245 }, { "epoch": 4.845542281365589, "grad_norm": 4.884112848650207, "learning_rate": 1.1907280850081416e-08, "loss": 0.3297, "step": 14250 }, { "epoch": 4.847242889332937, "grad_norm": 4.079118794301395, "learning_rate": 1.1644349106326446e-08, "loss": 0.3458, "step": 14255 }, { "epoch": 4.848943497300285, "grad_norm": 4.606447266319099, "learning_rate": 1.138434606351907e-08, "loss": 0.304, "step": 14260 }, { "epoch": 4.850644105267634, "grad_norm": 7.463303603237622, "learning_rate": 1.1127272027672553e-08, "loss": 0.3416, "step": 14265 }, { "epoch": 4.852344713234982, "grad_norm": 5.457559221356962, "learning_rate": 1.0873127301353759e-08, "loss": 0.3433, "step": 14270 }, { "epoch": 4.85404532120233, "grad_norm": 3.8041665918547927, "learning_rate": 1.0621912183681471e-08, "loss": 0.3236, "step": 14275 }, { "epoch": 4.855745929169678, "grad_norm": 4.930478268701416, "learning_rate": 1.0373626970326122e-08, "loss": 0.3263, "step": 14280 }, { "epoch": 4.857446537137027, "grad_norm": 3.8579986437154767, "learning_rate": 1.0128271953510627e-08, "loss": 0.3287, "step": 14285 }, { "epoch": 4.859147145104375, "grad_norm": 3.938039554816089, "learning_rate": 9.885847422008155e-09, "loss": 0.3169, "step": 14290 }, { "epoch": 4.860847753071723, "grad_norm": 3.2680166121584744, "learning_rate": 9.646353661143248e-09, "loss": 0.2959, "step": 14295 }, { "epoch": 4.862548361039072, "grad_norm": 4.24184915115148, "learning_rate": 9.409790952791265e-09, "loss": 0.3199, "step": 14300 }, { "epoch": 4.86424896900642, "grad_norm": 4.150684459822878, "learning_rate": 9.176159575377542e-09, "loss": 0.3193, "step": 14305 }, { "epoch": 4.865949576973768, "grad_norm": 4.386856416297783, "learning_rate": 8.945459803877399e-09, "loss": 0.3256, "step": 14310 }, { "epoch": 4.867650184941116, "grad_norm": 7.261086576109466, "learning_rate": 8.717691909815861e-09, "loss": 0.3317, "step": 14315 }, { "epoch": 4.869350792908465, "grad_norm": 4.526817758996456, "learning_rate": 8.492856161266827e-09, "loss": 0.3382, "step": 14320 }, { "epoch": 4.871051400875813, "grad_norm": 9.745905029849254, "learning_rate": 8.270952822854173e-09, "loss": 0.3144, "step": 14325 }, { "epoch": 4.872752008843161, "grad_norm": 4.157833604852971, "learning_rate": 8.051982155748983e-09, "loss": 0.3324, "step": 14330 }, { "epoch": 4.87445261681051, "grad_norm": 5.334779588145237, "learning_rate": 7.835944417672047e-09, "loss": 0.3368, "step": 14335 }, { "epoch": 4.876153224777858, "grad_norm": 4.234055385260185, "learning_rate": 7.622839862891363e-09, "loss": 0.3215, "step": 14340 }, { "epoch": 4.877853832745206, "grad_norm": 6.635208082117994, "learning_rate": 7.412668742223239e-09, "loss": 0.3282, "step": 14345 }, { "epoch": 4.879554440712555, "grad_norm": 4.579780726869808, "learning_rate": 7.205431303030919e-09, "loss": 0.3453, "step": 14350 }, { "epoch": 4.881255048679903, "grad_norm": 7.936028346894105, "learning_rate": 7.001127789225404e-09, "loss": 0.3272, "step": 14355 }, { "epoch": 4.882955656647251, "grad_norm": 5.496662933139069, "learning_rate": 6.799758441263793e-09, "loss": 0.3329, "step": 14360 }, { "epoch": 4.8846562646146, "grad_norm": 7.823396970762261, "learning_rate": 6.601323496150391e-09, "loss": 0.3202, "step": 14365 }, { "epoch": 4.886356872581948, "grad_norm": 33.03284728252866, "learning_rate": 6.405823187435878e-09, "loss": 0.336, "step": 14370 }, { "epoch": 4.888057480549296, "grad_norm": 4.953899415870555, "learning_rate": 6.21325774521675e-09, "loss": 0.3167, "step": 14375 }, { "epoch": 4.889758088516645, "grad_norm": 4.117838498124844, "learning_rate": 6.023627396135046e-09, "loss": 0.3446, "step": 14380 }, { "epoch": 4.891458696483993, "grad_norm": 3.9367115304478677, "learning_rate": 5.836932363378345e-09, "loss": 0.3222, "step": 14385 }, { "epoch": 4.893159304451341, "grad_norm": 4.510819927911293, "learning_rate": 5.653172866680323e-09, "loss": 0.3249, "step": 14390 }, { "epoch": 4.894859912418689, "grad_norm": 10.001543119830593, "learning_rate": 5.472349122318532e-09, "loss": 0.3575, "step": 14395 }, { "epoch": 4.896560520386038, "grad_norm": 4.7034873943443385, "learning_rate": 5.294461343115509e-09, "loss": 0.3358, "step": 14400 }, { "epoch": 4.898261128353386, "grad_norm": 5.090134015217698, "learning_rate": 5.119509738439332e-09, "loss": 0.3306, "step": 14405 }, { "epoch": 4.899961736320734, "grad_norm": 4.590455859703564, "learning_rate": 4.947494514200568e-09, "loss": 0.3223, "step": 14410 }, { "epoch": 4.901662344288083, "grad_norm": 6.328085485050238, "learning_rate": 4.778415872855047e-09, "loss": 0.3443, "step": 14415 }, { "epoch": 4.903362952255431, "grad_norm": 3.538138410045203, "learning_rate": 4.612274013401918e-09, "loss": 0.3497, "step": 14420 }, { "epoch": 4.9050635602227795, "grad_norm": 6.790611192620718, "learning_rate": 4.449069131383932e-09, "loss": 0.3301, "step": 14425 }, { "epoch": 4.9067641681901275, "grad_norm": 6.7806268756366945, "learning_rate": 4.288801418887156e-09, "loss": 0.336, "step": 14430 }, { "epoch": 4.9084647761574765, "grad_norm": 10.540979291550155, "learning_rate": 4.131471064540427e-09, "loss": 0.3295, "step": 14435 }, { "epoch": 4.9101653841248245, "grad_norm": 3.7211698539878655, "learning_rate": 3.977078253515898e-09, "loss": 0.3135, "step": 14440 }, { "epoch": 4.911865992092173, "grad_norm": 3.5706156017562978, "learning_rate": 3.825623167527936e-09, "loss": 0.3122, "step": 14445 }, { "epoch": 4.9135666000595215, "grad_norm": 5.847975032828931, "learning_rate": 3.6771059848333956e-09, "loss": 0.3311, "step": 14450 }, { "epoch": 4.91526720802687, "grad_norm": 3.381943041804788, "learning_rate": 3.531526880231617e-09, "loss": 0.3191, "step": 14455 }, { "epoch": 4.916967815994218, "grad_norm": 5.237877413713388, "learning_rate": 3.388886025063598e-09, "loss": 0.3235, "step": 14460 }, { "epoch": 4.918668423961567, "grad_norm": 4.369904248870116, "learning_rate": 3.2491835872125455e-09, "loss": 0.3283, "step": 14465 }, { "epoch": 4.920369031928915, "grad_norm": 6.527563018262631, "learning_rate": 3.1124197311024896e-09, "loss": 0.2977, "step": 14470 }, { "epoch": 4.922069639896263, "grad_norm": 6.879941803520668, "learning_rate": 2.9785946176996703e-09, "loss": 0.3217, "step": 14475 }, { "epoch": 4.923770247863612, "grad_norm": 5.22069543753945, "learning_rate": 2.8477084045111513e-09, "loss": 0.3536, "step": 14480 }, { "epoch": 4.92547085583096, "grad_norm": 4.145546124082018, "learning_rate": 2.7197612455850952e-09, "loss": 0.3381, "step": 14485 }, { "epoch": 4.927171463798308, "grad_norm": 4.2734827922564875, "learning_rate": 2.5947532915102105e-09, "loss": 0.34, "step": 14490 }, { "epoch": 4.928872071765657, "grad_norm": 6.600727915702121, "learning_rate": 2.4726846894165823e-09, "loss": 0.3416, "step": 14495 }, { "epoch": 4.930572679733005, "grad_norm": 4.013607295901653, "learning_rate": 2.353555582974287e-09, "loss": 0.3262, "step": 14500 }, { "epoch": 4.932273287700353, "grad_norm": 4.028623933831119, "learning_rate": 2.2373661123936687e-09, "loss": 0.3392, "step": 14505 }, { "epoch": 4.933973895667701, "grad_norm": 4.982694995695445, "learning_rate": 2.124116414425059e-09, "loss": 0.3388, "step": 14510 }, { "epoch": 4.93567450363505, "grad_norm": 22.898140404287325, "learning_rate": 2.0138066223596153e-09, "loss": 0.3288, "step": 14515 }, { "epoch": 4.937375111602398, "grad_norm": 7.380963270845392, "learning_rate": 1.90643686602765e-09, "loss": 0.3225, "step": 14520 }, { "epoch": 4.939075719569746, "grad_norm": 4.637343762454011, "learning_rate": 1.8020072717991889e-09, "loss": 0.3244, "step": 14525 }, { "epoch": 4.940776327537095, "grad_norm": 6.8534054772689315, "learning_rate": 1.7005179625842471e-09, "loss": 0.3398, "step": 14530 }, { "epoch": 4.942476935504443, "grad_norm": 4.787676916447148, "learning_rate": 1.6019690578314428e-09, "loss": 0.3373, "step": 14535 }, { "epoch": 4.944177543471791, "grad_norm": 6.9960588478076, "learning_rate": 1.5063606735293835e-09, "loss": 0.3367, "step": 14540 }, { "epoch": 4.945878151439139, "grad_norm": 4.598526229363827, "learning_rate": 1.4136929222058337e-09, "loss": 0.3277, "step": 14545 }, { "epoch": 4.947578759406488, "grad_norm": 5.057956091708963, "learning_rate": 1.3239659129266059e-09, "loss": 0.324, "step": 14550 }, { "epoch": 4.949279367373836, "grad_norm": 5.521031541373788, "learning_rate": 1.2371797512975014e-09, "loss": 0.3574, "step": 14555 }, { "epoch": 4.950979975341184, "grad_norm": 4.256729862265275, "learning_rate": 1.1533345394623696e-09, "loss": 0.3278, "step": 14560 }, { "epoch": 4.952680583308533, "grad_norm": 7.058631958237392, "learning_rate": 1.0724303761042165e-09, "loss": 0.3161, "step": 14565 }, { "epoch": 4.954381191275881, "grad_norm": 4.0105963723265114, "learning_rate": 9.944673564435403e-10, "loss": 0.3388, "step": 14570 }, { "epoch": 4.956081799243229, "grad_norm": 5.0775775613873515, "learning_rate": 9.194455722405515e-10, "loss": 0.3169, "step": 14575 }, { "epoch": 4.957782407210578, "grad_norm": 16.218514657286182, "learning_rate": 8.473651117923976e-10, "loss": 0.3212, "step": 14580 }, { "epoch": 4.959483015177926, "grad_norm": 4.491387347459075, "learning_rate": 7.782260599356606e-10, "loss": 0.3479, "step": 14585 }, { "epoch": 4.961183623145274, "grad_norm": 6.522099950741569, "learning_rate": 7.120284980441372e-10, "loss": 0.3316, "step": 14590 }, { "epoch": 4.962884231112623, "grad_norm": 6.5595159956988205, "learning_rate": 6.487725040299487e-10, "loss": 0.3399, "step": 14595 }, { "epoch": 4.964584839079971, "grad_norm": 12.52246280240769, "learning_rate": 5.884581523429856e-10, "loss": 0.3101, "step": 14600 }, { "epoch": 4.966285447047319, "grad_norm": 3.867868696316935, "learning_rate": 5.310855139709082e-10, "loss": 0.3392, "step": 14605 }, { "epoch": 4.967986055014668, "grad_norm": 4.72438646289047, "learning_rate": 4.766546564391461e-10, "loss": 0.336, "step": 14610 }, { "epoch": 4.969686662982016, "grad_norm": 4.621887636559062, "learning_rate": 4.2516564381089864e-10, "loss": 0.3262, "step": 14615 }, { "epoch": 4.971387270949364, "grad_norm": 5.3117366358284315, "learning_rate": 3.766185366868569e-10, "loss": 0.3035, "step": 14620 }, { "epoch": 4.973087878916713, "grad_norm": 5.98696897470915, "learning_rate": 3.3101339220492637e-10, "loss": 0.3294, "step": 14625 }, { "epoch": 4.974788486884061, "grad_norm": 5.125906773564488, "learning_rate": 2.883502640405045e-10, "loss": 0.3312, "step": 14630 }, { "epoch": 4.976489094851409, "grad_norm": 9.435480750740867, "learning_rate": 2.486292024070358e-10, "loss": 0.3434, "step": 14635 }, { "epoch": 4.978189702818757, "grad_norm": 8.434443648518124, "learning_rate": 2.11850254054069e-10, "loss": 0.3285, "step": 14640 }, { "epoch": 4.979890310786106, "grad_norm": 4.90195079746842, "learning_rate": 1.7801346226947736e-10, "loss": 0.3252, "step": 14645 }, { "epoch": 4.981590918753454, "grad_norm": 21.019745490751443, "learning_rate": 1.4711886687807097e-10, "loss": 0.3381, "step": 14650 }, { "epoch": 4.9832915267208024, "grad_norm": 4.632978757783584, "learning_rate": 1.191665042410417e-10, "loss": 0.3404, "step": 14655 }, { "epoch": 4.984992134688151, "grad_norm": 3.9419023704852543, "learning_rate": 9.415640725762843e-11, "loss": 0.3189, "step": 14660 }, { "epoch": 4.986692742655499, "grad_norm": 4.6062000770567675, "learning_rate": 7.208860536345174e-11, "loss": 0.3345, "step": 14665 }, { "epoch": 4.9883933506228475, "grad_norm": 7.7648848410049585, "learning_rate": 5.296312453217933e-11, "loss": 0.3167, "step": 14670 }, { "epoch": 4.9900939585901956, "grad_norm": 6.071758031929791, "learning_rate": 3.677998727302789e-11, "loss": 0.305, "step": 14675 }, { "epoch": 4.9917945665575445, "grad_norm": 6.055511194549918, "learning_rate": 2.35392126332612e-11, "loss": 0.3061, "step": 14680 }, { "epoch": 4.9934951745248926, "grad_norm": 5.787453807652962, "learning_rate": 1.3240816196802287e-11, "loss": 0.3292, "step": 14685 }, { "epoch": 4.995195782492241, "grad_norm": 5.439973255765252, "learning_rate": 5.884810084511028e-12, "loss": 0.3282, "step": 14690 }, { "epoch": 4.9968963904595896, "grad_norm": 4.592926398935226, "learning_rate": 1.4712029539065698e-12, "loss": 0.3253, "step": 14695 }, { "epoch": 4.998596998426938, "grad_norm": 4.794272151329833, "learning_rate": 0.0, "loss": 0.3183, "step": 14700 }, { "epoch": 4.998596998426938, "step": 14700, "total_flos": 6727881718628352.0, "train_loss": 0.41388291033757785, "train_runtime": 737214.1133, "train_samples_per_second": 1.276, "train_steps_per_second": 0.02 } ], "logging_steps": 5, "max_steps": 14700, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6727881718628352.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }