diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20622 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.998596998426938, + "eval_steps": 500, + "global_step": 14700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017006079673483271, + "grad_norm": 14.845677861036755, + "learning_rate": 1.1312217194570136e-07, + "loss": 1.8551, + "step": 5 + }, + { + "epoch": 0.0034012159346966542, + "grad_norm": 10.628996742983775, + "learning_rate": 2.2624434389140273e-07, + "loss": 1.8205, + "step": 10 + }, + { + "epoch": 0.005101823902044981, + "grad_norm": 16.209420936017096, + "learning_rate": 3.393665158371041e-07, + "loss": 1.8248, + "step": 15 + }, + { + "epoch": 0.0068024318693933085, + "grad_norm": 14.057467625353302, + "learning_rate": 4.5248868778280546e-07, + "loss": 1.8197, + "step": 20 + }, + { + "epoch": 0.008503039836741635, + "grad_norm": 8.549081078810927, + "learning_rate": 5.656108597285068e-07, + "loss": 1.8493, + "step": 25 + }, + { + "epoch": 0.010203647804089963, + "grad_norm": 8.704433651079075, + "learning_rate": 6.787330316742082e-07, + "loss": 1.811, + "step": 30 + }, + { + "epoch": 0.011904255771438289, + "grad_norm": 12.2652942343875, + "learning_rate": 7.918552036199095e-07, + "loss": 1.8156, + "step": 35 + }, + { + "epoch": 0.013604863738786617, + "grad_norm": 7.896258864091117, + "learning_rate": 9.049773755656109e-07, + "loss": 1.8359, + "step": 40 + }, + { + "epoch": 0.015305471706134943, + "grad_norm": 6.971049091184883, + "learning_rate": 1.0180995475113123e-06, + "loss": 1.8116, + "step": 45 + }, + { + "epoch": 0.01700607967348327, + "grad_norm": 6.957776653029673, + "learning_rate": 1.1312217194570136e-06, + "loss": 1.7826, + "step": 50 + }, + { + "epoch": 0.018706687640831596, + "grad_norm": 9.177739116592859, + "learning_rate": 1.244343891402715e-06, + "loss": 1.7557, + "step": 55 + }, + { + "epoch": 0.020407295608179925, + "grad_norm": 7.756059818874078, + "learning_rate": 1.3574660633484164e-06, + "loss": 1.7249, + "step": 60 + }, + { + "epoch": 0.02210790357552825, + "grad_norm": 18.463301673578197, + "learning_rate": 1.4705882352941177e-06, + "loss": 1.6644, + "step": 65 + }, + { + "epoch": 0.023808511542876578, + "grad_norm": 10.995078036433805, + "learning_rate": 1.583710407239819e-06, + "loss": 1.649, + "step": 70 + }, + { + "epoch": 0.025509119510224904, + "grad_norm": 12.94230015695223, + "learning_rate": 1.6968325791855207e-06, + "loss": 1.659, + "step": 75 + }, + { + "epoch": 0.027209727477573234, + "grad_norm": 6.76101300426946, + "learning_rate": 1.8099547511312218e-06, + "loss": 1.6152, + "step": 80 + }, + { + "epoch": 0.02891033544492156, + "grad_norm": 8.812543750769409, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.5711, + "step": 85 + }, + { + "epoch": 0.030610943412269886, + "grad_norm": 5.697844190371594, + "learning_rate": 2.0361990950226245e-06, + "loss": 1.5305, + "step": 90 + }, + { + "epoch": 0.03231155137961821, + "grad_norm": 19.374366768238563, + "learning_rate": 2.149321266968326e-06, + "loss": 1.5062, + "step": 95 + }, + { + "epoch": 0.03401215934696654, + "grad_norm": 5.157411864253761, + "learning_rate": 2.2624434389140273e-06, + "loss": 1.4838, + "step": 100 + }, + { + "epoch": 0.035712767314314865, + "grad_norm": 13.81826997890456, + "learning_rate": 2.3755656108597284e-06, + "loss": 1.4604, + "step": 105 + }, + { + "epoch": 0.03741337528166319, + "grad_norm": 8.371503617793609, + "learning_rate": 2.48868778280543e-06, + "loss": 1.424, + "step": 110 + }, + { + "epoch": 0.039113983249011525, + "grad_norm": 4.489966249256709, + "learning_rate": 2.6018099547511316e-06, + "loss": 1.3639, + "step": 115 + }, + { + "epoch": 0.04081459121635985, + "grad_norm": 6.698307717633603, + "learning_rate": 2.7149321266968327e-06, + "loss": 1.3189, + "step": 120 + }, + { + "epoch": 0.04251519918370818, + "grad_norm": 7.92343503351275, + "learning_rate": 2.8280542986425343e-06, + "loss": 1.2216, + "step": 125 + }, + { + "epoch": 0.0442158071510565, + "grad_norm": 7.3532950329675755, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.1338, + "step": 130 + }, + { + "epoch": 0.04591641511840483, + "grad_norm": 6.168611954311049, + "learning_rate": 3.054298642533937e-06, + "loss": 1.0414, + "step": 135 + }, + { + "epoch": 0.047617023085753156, + "grad_norm": 5.875428318783832, + "learning_rate": 3.167420814479638e-06, + "loss": 0.9731, + "step": 140 + }, + { + "epoch": 0.04931763105310148, + "grad_norm": 15.936106002351035, + "learning_rate": 3.2805429864253398e-06, + "loss": 0.9013, + "step": 145 + }, + { + "epoch": 0.05101823902044981, + "grad_norm": 14.723970861685023, + "learning_rate": 3.3936651583710413e-06, + "loss": 0.8686, + "step": 150 + }, + { + "epoch": 0.052718846987798135, + "grad_norm": 6.296754035865692, + "learning_rate": 3.506787330316742e-06, + "loss": 0.8403, + "step": 155 + }, + { + "epoch": 0.05441945495514647, + "grad_norm": 11.410646614152762, + "learning_rate": 3.6199095022624436e-06, + "loss": 0.82, + "step": 160 + }, + { + "epoch": 0.056120062922494794, + "grad_norm": 6.179450576960927, + "learning_rate": 3.7330316742081452e-06, + "loss": 0.8021, + "step": 165 + }, + { + "epoch": 0.05782067088984312, + "grad_norm": 7.548417828622113, + "learning_rate": 3.846153846153847e-06, + "loss": 0.7834, + "step": 170 + }, + { + "epoch": 0.05952127885719145, + "grad_norm": 8.169502681440726, + "learning_rate": 3.959276018099548e-06, + "loss": 0.7809, + "step": 175 + }, + { + "epoch": 0.06122188682453977, + "grad_norm": 8.997947648112065, + "learning_rate": 4.072398190045249e-06, + "loss": 0.7669, + "step": 180 + }, + { + "epoch": 0.0629224947918881, + "grad_norm": 23.942263352004463, + "learning_rate": 4.185520361990951e-06, + "loss": 0.7577, + "step": 185 + }, + { + "epoch": 0.06462310275923643, + "grad_norm": 9.65549631769889, + "learning_rate": 4.298642533936652e-06, + "loss": 0.7592, + "step": 190 + }, + { + "epoch": 0.06632371072658476, + "grad_norm": 9.151096835570465, + "learning_rate": 4.411764705882353e-06, + "loss": 0.732, + "step": 195 + }, + { + "epoch": 0.06802431869393308, + "grad_norm": 23.689339149203136, + "learning_rate": 4.5248868778280546e-06, + "loss": 0.7169, + "step": 200 + }, + { + "epoch": 0.06972492666128141, + "grad_norm": 9.503382054799566, + "learning_rate": 4.6380090497737566e-06, + "loss": 0.7167, + "step": 205 + }, + { + "epoch": 0.07142553462862973, + "grad_norm": 11.060398318836771, + "learning_rate": 4.751131221719457e-06, + "loss": 0.7278, + "step": 210 + }, + { + "epoch": 0.07312614259597806, + "grad_norm": 14.275008718187829, + "learning_rate": 4.864253393665159e-06, + "loss": 0.6725, + "step": 215 + }, + { + "epoch": 0.07482675056332638, + "grad_norm": 9.254261400953006, + "learning_rate": 4.97737556561086e-06, + "loss": 0.7149, + "step": 220 + }, + { + "epoch": 0.07652735853067472, + "grad_norm": 12.152819376143459, + "learning_rate": 4.999999058430077e-06, + "loss": 0.6756, + "step": 225 + }, + { + "epoch": 0.07822796649802305, + "grad_norm": 12.71525277853713, + "learning_rate": 4.999995233303476e-06, + "loss": 0.72, + "step": 230 + }, + { + "epoch": 0.07992857446537137, + "grad_norm": 16.52069302728288, + "learning_rate": 4.999988465776579e-06, + "loss": 0.68, + "step": 235 + }, + { + "epoch": 0.0816291824327197, + "grad_norm": 21.514609789250752, + "learning_rate": 4.999978755857349e-06, + "loss": 0.6682, + "step": 240 + }, + { + "epoch": 0.08332979040006802, + "grad_norm": 21.83122952918328, + "learning_rate": 4.999966103557213e-06, + "loss": 0.6931, + "step": 245 + }, + { + "epoch": 0.08503039836741635, + "grad_norm": 8.826610512818476, + "learning_rate": 4.999950508891065e-06, + "loss": 0.7082, + "step": 250 + }, + { + "epoch": 0.08673100633476467, + "grad_norm": 7.902453275444064, + "learning_rate": 4.999931971877258e-06, + "loss": 0.676, + "step": 255 + }, + { + "epoch": 0.088431614302113, + "grad_norm": 16.854627629783014, + "learning_rate": 4.99991049253761e-06, + "loss": 0.6749, + "step": 260 + }, + { + "epoch": 0.09013222226946133, + "grad_norm": 9.936283538247146, + "learning_rate": 4.999886070897401e-06, + "loss": 0.683, + "step": 265 + }, + { + "epoch": 0.09183283023680966, + "grad_norm": 9.757808014367393, + "learning_rate": 4.999858706985373e-06, + "loss": 0.6862, + "step": 270 + }, + { + "epoch": 0.09353343820415799, + "grad_norm": 19.431419507015317, + "learning_rate": 4.999828400833734e-06, + "loss": 0.6694, + "step": 275 + }, + { + "epoch": 0.09523404617150631, + "grad_norm": 15.051911904223115, + "learning_rate": 4.999795152478153e-06, + "loss": 0.6801, + "step": 280 + }, + { + "epoch": 0.09693465413885465, + "grad_norm": 6.485125825801793, + "learning_rate": 4.999758961957761e-06, + "loss": 0.6419, + "step": 285 + }, + { + "epoch": 0.09863526210620296, + "grad_norm": 11.220090644662514, + "learning_rate": 4.999719829315155e-06, + "loss": 0.6408, + "step": 290 + }, + { + "epoch": 0.1003358700735513, + "grad_norm": 10.519717283422201, + "learning_rate": 4.99967775459639e-06, + "loss": 0.6608, + "step": 295 + }, + { + "epoch": 0.10203647804089962, + "grad_norm": 6.226029435794413, + "learning_rate": 4.999632737850989e-06, + "loss": 0.6091, + "step": 300 + }, + { + "epoch": 0.10373708600824795, + "grad_norm": 8.430092149740332, + "learning_rate": 4.999584779131933e-06, + "loss": 0.6631, + "step": 305 + }, + { + "epoch": 0.10543769397559627, + "grad_norm": 12.188582572432878, + "learning_rate": 4.999533878495668e-06, + "loss": 0.6165, + "step": 310 + }, + { + "epoch": 0.1071383019429446, + "grad_norm": 45.910216646833675, + "learning_rate": 4.9994800360021025e-06, + "loss": 0.666, + "step": 315 + }, + { + "epoch": 0.10883890991029294, + "grad_norm": 10.217829004467873, + "learning_rate": 4.999423251714608e-06, + "loss": 0.6705, + "step": 320 + }, + { + "epoch": 0.11053951787764126, + "grad_norm": 14.882953923091867, + "learning_rate": 4.999363525700016e-06, + "loss": 0.6803, + "step": 325 + }, + { + "epoch": 0.11224012584498959, + "grad_norm": 31.58978597081891, + "learning_rate": 4.999300858028622e-06, + "loss": 0.6537, + "step": 330 + }, + { + "epoch": 0.11394073381233791, + "grad_norm": 13.780953871869633, + "learning_rate": 4.999235248774183e-06, + "loss": 0.662, + "step": 335 + }, + { + "epoch": 0.11564134177968624, + "grad_norm": 10.335809816067288, + "learning_rate": 4.999166698013921e-06, + "loss": 0.6321, + "step": 340 + }, + { + "epoch": 0.11734194974703456, + "grad_norm": 11.932357562298984, + "learning_rate": 4.999095205828515e-06, + "loss": 0.6204, + "step": 345 + }, + { + "epoch": 0.1190425577143829, + "grad_norm": 15.051840430556013, + "learning_rate": 4.99902077230211e-06, + "loss": 0.6196, + "step": 350 + }, + { + "epoch": 0.12074316568173121, + "grad_norm": 5.225931699307834, + "learning_rate": 4.9989433975223105e-06, + "loss": 0.6026, + "step": 355 + }, + { + "epoch": 0.12244377364907955, + "grad_norm": 23.381306701490157, + "learning_rate": 4.9988630815801845e-06, + "loss": 0.6267, + "step": 360 + }, + { + "epoch": 0.12414438161642788, + "grad_norm": 5.887716941478886, + "learning_rate": 4.9987798245702615e-06, + "loss": 0.64, + "step": 365 + }, + { + "epoch": 0.1258449895837762, + "grad_norm": 9.763764194339249, + "learning_rate": 4.99869362659053e-06, + "loss": 0.6487, + "step": 370 + }, + { + "epoch": 0.12754559755112452, + "grad_norm": 18.995686135308443, + "learning_rate": 4.998604487742444e-06, + "loss": 0.6238, + "step": 375 + }, + { + "epoch": 0.12924620551847285, + "grad_norm": 16.79081962728407, + "learning_rate": 4.998512408130914e-06, + "loss": 0.6544, + "step": 380 + }, + { + "epoch": 0.13094681348582118, + "grad_norm": 22.69480477412656, + "learning_rate": 4.998417387864316e-06, + "loss": 0.6659, + "step": 385 + }, + { + "epoch": 0.13264742145316952, + "grad_norm": 8.466445322881029, + "learning_rate": 4.998319427054486e-06, + "loss": 0.6341, + "step": 390 + }, + { + "epoch": 0.13434802942051782, + "grad_norm": 19.163211220414826, + "learning_rate": 4.998218525816717e-06, + "loss": 0.6256, + "step": 395 + }, + { + "epoch": 0.13604863738786616, + "grad_norm": 10.50771347186857, + "learning_rate": 4.99811468426977e-06, + "loss": 0.6509, + "step": 400 + }, + { + "epoch": 0.1377492453552145, + "grad_norm": 7.984299752999823, + "learning_rate": 4.99800790253586e-06, + "loss": 0.6003, + "step": 405 + }, + { + "epoch": 0.13944985332256282, + "grad_norm": 14.463342596093812, + "learning_rate": 4.997898180740665e-06, + "loss": 0.6412, + "step": 410 + }, + { + "epoch": 0.14115046128991116, + "grad_norm": 13.750816687294764, + "learning_rate": 4.997785519013324e-06, + "loss": 0.6466, + "step": 415 + }, + { + "epoch": 0.14285106925725946, + "grad_norm": 10.10934252434204, + "learning_rate": 4.997669917486437e-06, + "loss": 0.6155, + "step": 420 + }, + { + "epoch": 0.1445516772246078, + "grad_norm": 17.79848414615945, + "learning_rate": 4.997551376296061e-06, + "loss": 0.6091, + "step": 425 + }, + { + "epoch": 0.14625228519195613, + "grad_norm": 8.761272326725976, + "learning_rate": 4.997429895581715e-06, + "loss": 0.6654, + "step": 430 + }, + { + "epoch": 0.14795289315930446, + "grad_norm": 10.547931177670689, + "learning_rate": 4.9973054754863765e-06, + "loss": 0.6347, + "step": 435 + }, + { + "epoch": 0.14965350112665277, + "grad_norm": 21.541634564039974, + "learning_rate": 4.997178116156484e-06, + "loss": 0.6056, + "step": 440 + }, + { + "epoch": 0.1513541090940011, + "grad_norm": 58.40832117665425, + "learning_rate": 4.997047817741935e-06, + "loss": 0.599, + "step": 445 + }, + { + "epoch": 0.15305471706134943, + "grad_norm": 42.61840891068491, + "learning_rate": 4.996914580396085e-06, + "loss": 0.6139, + "step": 450 + }, + { + "epoch": 0.15475532502869777, + "grad_norm": 33.2602204534426, + "learning_rate": 4.99677840427575e-06, + "loss": 0.5814, + "step": 455 + }, + { + "epoch": 0.1564559329960461, + "grad_norm": 17.75586783558878, + "learning_rate": 4.9966392895412035e-06, + "loss": 0.6225, + "step": 460 + }, + { + "epoch": 0.1581565409633944, + "grad_norm": 22.11341977382103, + "learning_rate": 4.996497236356179e-06, + "loss": 0.6116, + "step": 465 + }, + { + "epoch": 0.15985714893074274, + "grad_norm": 6.925398096788078, + "learning_rate": 4.996352244887868e-06, + "loss": 0.6278, + "step": 470 + }, + { + "epoch": 0.16155775689809107, + "grad_norm": 13.062658589923394, + "learning_rate": 4.996204315306918e-06, + "loss": 0.6352, + "step": 475 + }, + { + "epoch": 0.1632583648654394, + "grad_norm": 9.168214718861417, + "learning_rate": 4.996053447787439e-06, + "loss": 0.6279, + "step": 480 + }, + { + "epoch": 0.1649589728327877, + "grad_norm": 11.089977042081552, + "learning_rate": 4.995899642506995e-06, + "loss": 0.5696, + "step": 485 + }, + { + "epoch": 0.16665958080013604, + "grad_norm": 21.32464496096674, + "learning_rate": 4.99574289964661e-06, + "loss": 0.6424, + "step": 490 + }, + { + "epoch": 0.16836018876748438, + "grad_norm": 10.20829075940166, + "learning_rate": 4.995583219390764e-06, + "loss": 0.6468, + "step": 495 + }, + { + "epoch": 0.1700607967348327, + "grad_norm": 12.808347360487026, + "learning_rate": 4.995420601927393e-06, + "loss": 0.6019, + "step": 500 + }, + { + "epoch": 0.17176140470218104, + "grad_norm": 14.52851578382901, + "learning_rate": 4.9952550474478944e-06, + "loss": 0.6404, + "step": 505 + }, + { + "epoch": 0.17346201266952935, + "grad_norm": 9.455805978378836, + "learning_rate": 4.995086556147118e-06, + "loss": 0.6054, + "step": 510 + }, + { + "epoch": 0.17516262063687768, + "grad_norm": 11.538105090235101, + "learning_rate": 4.994915128223372e-06, + "loss": 0.5966, + "step": 515 + }, + { + "epoch": 0.176863228604226, + "grad_norm": 8.620211977586647, + "learning_rate": 4.994740763878421e-06, + "loss": 0.5818, + "step": 520 + }, + { + "epoch": 0.17856383657157435, + "grad_norm": 5.597079164594491, + "learning_rate": 4.994563463317485e-06, + "loss": 0.6358, + "step": 525 + }, + { + "epoch": 0.18026444453892265, + "grad_norm": 33.76289557696585, + "learning_rate": 4.9943832267492395e-06, + "loss": 0.6182, + "step": 530 + }, + { + "epoch": 0.18196505250627099, + "grad_norm": 7.537374321416811, + "learning_rate": 4.9942000543858175e-06, + "loss": 0.5935, + "step": 535 + }, + { + "epoch": 0.18366566047361932, + "grad_norm": 22.412650764298824, + "learning_rate": 4.994013946442804e-06, + "loss": 0.6073, + "step": 540 + }, + { + "epoch": 0.18536626844096765, + "grad_norm": 8.955431330054688, + "learning_rate": 4.993824903139243e-06, + "loss": 0.6291, + "step": 545 + }, + { + "epoch": 0.18706687640831599, + "grad_norm": 15.448389517405271, + "learning_rate": 4.99363292469763e-06, + "loss": 0.6185, + "step": 550 + }, + { + "epoch": 0.1887674843756643, + "grad_norm": 15.638782533428731, + "learning_rate": 4.993438011343918e-06, + "loss": 0.6126, + "step": 555 + }, + { + "epoch": 0.19046809234301262, + "grad_norm": 18.146240583453405, + "learning_rate": 4.99324016330751e-06, + "loss": 0.5975, + "step": 560 + }, + { + "epoch": 0.19216870031036096, + "grad_norm": 6.679553445363789, + "learning_rate": 4.993039380821268e-06, + "loss": 0.5781, + "step": 565 + }, + { + "epoch": 0.1938693082777093, + "grad_norm": 7.274465103154115, + "learning_rate": 4.992835664121506e-06, + "loss": 0.6128, + "step": 570 + }, + { + "epoch": 0.1955699162450576, + "grad_norm": 8.554386699728521, + "learning_rate": 4.9926290134479885e-06, + "loss": 0.5844, + "step": 575 + }, + { + "epoch": 0.19727052421240593, + "grad_norm": 15.461566631562391, + "learning_rate": 4.992419429043937e-06, + "loss": 0.5844, + "step": 580 + }, + { + "epoch": 0.19897113217975426, + "grad_norm": 19.367094609515036, + "learning_rate": 4.992206911156024e-06, + "loss": 0.5846, + "step": 585 + }, + { + "epoch": 0.2006717401471026, + "grad_norm": 7.949321770564925, + "learning_rate": 4.991991460034376e-06, + "loss": 0.5842, + "step": 590 + }, + { + "epoch": 0.20237234811445093, + "grad_norm": 33.169106678809236, + "learning_rate": 4.991773075932569e-06, + "loss": 0.6138, + "step": 595 + }, + { + "epoch": 0.20407295608179923, + "grad_norm": 11.807875501068386, + "learning_rate": 4.991551759107634e-06, + "loss": 0.605, + "step": 600 + }, + { + "epoch": 0.20577356404914757, + "grad_norm": 9.818511278159773, + "learning_rate": 4.991327509820053e-06, + "loss": 0.59, + "step": 605 + }, + { + "epoch": 0.2074741720164959, + "grad_norm": 13.519496693016531, + "learning_rate": 4.991100328333758e-06, + "loss": 0.5705, + "step": 610 + }, + { + "epoch": 0.20917477998384423, + "grad_norm": 11.998677980651333, + "learning_rate": 4.990870214916134e-06, + "loss": 0.5961, + "step": 615 + }, + { + "epoch": 0.21087538795119254, + "grad_norm": 12.57627298318073, + "learning_rate": 4.990637169838016e-06, + "loss": 0.549, + "step": 620 + }, + { + "epoch": 0.21257599591854087, + "grad_norm": 17.643005954195313, + "learning_rate": 4.990401193373688e-06, + "loss": 0.5859, + "step": 625 + }, + { + "epoch": 0.2142766038858892, + "grad_norm": 8.023911558473781, + "learning_rate": 4.990162285800886e-06, + "loss": 0.6005, + "step": 630 + }, + { + "epoch": 0.21597721185323754, + "grad_norm": 11.829776562644458, + "learning_rate": 4.989920447400795e-06, + "loss": 0.5895, + "step": 635 + }, + { + "epoch": 0.21767781982058587, + "grad_norm": 10.355274989975698, + "learning_rate": 4.989675678458051e-06, + "loss": 0.5783, + "step": 640 + }, + { + "epoch": 0.21937842778793418, + "grad_norm": 8.69232719426023, + "learning_rate": 4.989427979260736e-06, + "loss": 0.5789, + "step": 645 + }, + { + "epoch": 0.2210790357552825, + "grad_norm": 37.636377516133614, + "learning_rate": 4.989177350100383e-06, + "loss": 0.5719, + "step": 650 + }, + { + "epoch": 0.22277964372263084, + "grad_norm": 5.815207714212499, + "learning_rate": 4.988923791271976e-06, + "loss": 0.583, + "step": 655 + }, + { + "epoch": 0.22448025168997918, + "grad_norm": 40.068811104492795, + "learning_rate": 4.98866730307394e-06, + "loss": 0.6066, + "step": 660 + }, + { + "epoch": 0.22618085965732748, + "grad_norm": 8.20744741521443, + "learning_rate": 4.988407885808153e-06, + "loss": 0.5652, + "step": 665 + }, + { + "epoch": 0.22788146762467582, + "grad_norm": 11.283153989927639, + "learning_rate": 4.988145539779941e-06, + "loss": 0.5962, + "step": 670 + }, + { + "epoch": 0.22958207559202415, + "grad_norm": 8.555390469834528, + "learning_rate": 4.987880265298074e-06, + "loss": 0.5461, + "step": 675 + }, + { + "epoch": 0.23128268355937248, + "grad_norm": 34.05887296744781, + "learning_rate": 4.987612062674771e-06, + "loss": 0.547, + "step": 680 + }, + { + "epoch": 0.23298329152672081, + "grad_norm": 7.011722422652996, + "learning_rate": 4.9873409322256965e-06, + "loss": 0.6133, + "step": 685 + }, + { + "epoch": 0.23468389949406912, + "grad_norm": 108.36469510812252, + "learning_rate": 4.9870668742699595e-06, + "loss": 0.5601, + "step": 690 + }, + { + "epoch": 0.23638450746141745, + "grad_norm": 9.570385046609053, + "learning_rate": 4.986789889130117e-06, + "loss": 0.5887, + "step": 695 + }, + { + "epoch": 0.2380851154287658, + "grad_norm": 9.717325030089524, + "learning_rate": 4.98650997713217e-06, + "loss": 0.561, + "step": 700 + }, + { + "epoch": 0.23978572339611412, + "grad_norm": 24.521659275717248, + "learning_rate": 4.986227138605564e-06, + "loss": 0.5919, + "step": 705 + }, + { + "epoch": 0.24148633136346243, + "grad_norm": 7.215984020082289, + "learning_rate": 4.985941373883189e-06, + "loss": 0.5791, + "step": 710 + }, + { + "epoch": 0.24318693933081076, + "grad_norm": 14.248665524828537, + "learning_rate": 4.985652683301379e-06, + "loss": 0.5486, + "step": 715 + }, + { + "epoch": 0.2448875472981591, + "grad_norm": 8.789194246601989, + "learning_rate": 4.985361067199915e-06, + "loss": 0.5545, + "step": 720 + }, + { + "epoch": 0.24658815526550742, + "grad_norm": 10.012395745311808, + "learning_rate": 4.985066525922014e-06, + "loss": 0.5868, + "step": 725 + }, + { + "epoch": 0.24828876323285576, + "grad_norm": 8.64765545690827, + "learning_rate": 4.984769059814343e-06, + "loss": 0.5867, + "step": 730 + }, + { + "epoch": 0.24998937120020406, + "grad_norm": 6.843706075436087, + "learning_rate": 4.984468669227007e-06, + "loss": 0.5557, + "step": 735 + }, + { + "epoch": 0.2516899791675524, + "grad_norm": 19.947108102711866, + "learning_rate": 4.984165354513555e-06, + "loss": 0.5571, + "step": 740 + }, + { + "epoch": 0.2533905871349007, + "grad_norm": 11.780021114199064, + "learning_rate": 4.983859116030976e-06, + "loss": 0.5666, + "step": 745 + }, + { + "epoch": 0.25509119510224904, + "grad_norm": 5.630908993158177, + "learning_rate": 4.983549954139702e-06, + "loss": 0.5511, + "step": 750 + }, + { + "epoch": 0.25679180306959737, + "grad_norm": 6.928219837079772, + "learning_rate": 4.983237869203606e-06, + "loss": 0.5519, + "step": 755 + }, + { + "epoch": 0.2584924110369457, + "grad_norm": 9.85491708098051, + "learning_rate": 4.982922861589997e-06, + "loss": 0.5477, + "step": 760 + }, + { + "epoch": 0.26019301900429403, + "grad_norm": 21.197537518574446, + "learning_rate": 4.982604931669631e-06, + "loss": 0.5686, + "step": 765 + }, + { + "epoch": 0.26189362697164237, + "grad_norm": 5.825778604217589, + "learning_rate": 4.982284079816697e-06, + "loss": 0.5858, + "step": 770 + }, + { + "epoch": 0.2635942349389907, + "grad_norm": 5.1625548382347075, + "learning_rate": 4.981960306408826e-06, + "loss": 0.5715, + "step": 775 + }, + { + "epoch": 0.26529484290633903, + "grad_norm": 10.036784754520419, + "learning_rate": 4.981633611827088e-06, + "loss": 0.5551, + "step": 780 + }, + { + "epoch": 0.26699545087368737, + "grad_norm": 10.305028664800242, + "learning_rate": 4.98130399645599e-06, + "loss": 0.5599, + "step": 785 + }, + { + "epoch": 0.26869605884103565, + "grad_norm": 4.589228297867227, + "learning_rate": 4.980971460683475e-06, + "loss": 0.5949, + "step": 790 + }, + { + "epoch": 0.270396666808384, + "grad_norm": 8.27995784938204, + "learning_rate": 4.980636004900927e-06, + "loss": 0.56, + "step": 795 + }, + { + "epoch": 0.2720972747757323, + "grad_norm": 4.493782493116703, + "learning_rate": 4.980297629503165e-06, + "loss": 0.5835, + "step": 800 + }, + { + "epoch": 0.27379788274308064, + "grad_norm": 7.126752565086873, + "learning_rate": 4.979956334888443e-06, + "loss": 0.5671, + "step": 805 + }, + { + "epoch": 0.275498490710429, + "grad_norm": 4.463794058054743, + "learning_rate": 4.979612121458452e-06, + "loss": 0.5703, + "step": 810 + }, + { + "epoch": 0.2771990986777773, + "grad_norm": 5.351060624932273, + "learning_rate": 4.9792649896183195e-06, + "loss": 0.5907, + "step": 815 + }, + { + "epoch": 0.27889970664512564, + "grad_norm": 92.26169791363449, + "learning_rate": 4.978914939776606e-06, + "loss": 0.5393, + "step": 820 + }, + { + "epoch": 0.280600314612474, + "grad_norm": 9.493610254132465, + "learning_rate": 4.978561972345306e-06, + "loss": 0.5723, + "step": 825 + }, + { + "epoch": 0.2823009225798223, + "grad_norm": 7.8072634292712575, + "learning_rate": 4.978206087739851e-06, + "loss": 0.5459, + "step": 830 + }, + { + "epoch": 0.2840015305471706, + "grad_norm": 5.331414597258025, + "learning_rate": 4.9778472863791e-06, + "loss": 0.5611, + "step": 835 + }, + { + "epoch": 0.2857021385145189, + "grad_norm": 16.750010087853518, + "learning_rate": 4.977485568685353e-06, + "loss": 0.5804, + "step": 840 + }, + { + "epoch": 0.28740274648186726, + "grad_norm": 9.52017836985358, + "learning_rate": 4.977120935084336e-06, + "loss": 0.5626, + "step": 845 + }, + { + "epoch": 0.2891033544492156, + "grad_norm": 5.48572070726049, + "learning_rate": 4.97675338600521e-06, + "loss": 0.6, + "step": 850 + }, + { + "epoch": 0.2908039624165639, + "grad_norm": 4.410706726025624, + "learning_rate": 4.976382921880564e-06, + "loss": 0.5426, + "step": 855 + }, + { + "epoch": 0.29250457038391225, + "grad_norm": 6.059898850103638, + "learning_rate": 4.976009543146423e-06, + "loss": 0.543, + "step": 860 + }, + { + "epoch": 0.2942051783512606, + "grad_norm": 20.172923388018912, + "learning_rate": 4.975633250242239e-06, + "loss": 0.5557, + "step": 865 + }, + { + "epoch": 0.2959057863186089, + "grad_norm": 5.932443651579922, + "learning_rate": 4.975254043610894e-06, + "loss": 0.5806, + "step": 870 + }, + { + "epoch": 0.29760639428595725, + "grad_norm": 3.8592716485343224, + "learning_rate": 4.9748719236987e-06, + "loss": 0.5553, + "step": 875 + }, + { + "epoch": 0.29930700225330553, + "grad_norm": 3.978579777692364, + "learning_rate": 4.974486890955398e-06, + "loss": 0.5903, + "step": 880 + }, + { + "epoch": 0.30100761022065387, + "grad_norm": 6.483714401233538, + "learning_rate": 4.9740989458341574e-06, + "loss": 0.527, + "step": 885 + }, + { + "epoch": 0.3027082181880022, + "grad_norm": 6.120570644247395, + "learning_rate": 4.973708088791574e-06, + "loss": 0.5599, + "step": 890 + }, + { + "epoch": 0.30440882615535053, + "grad_norm": 7.440959126495233, + "learning_rate": 4.973314320287674e-06, + "loss": 0.5384, + "step": 895 + }, + { + "epoch": 0.30610943412269886, + "grad_norm": 5.871413458662885, + "learning_rate": 4.972917640785906e-06, + "loss": 0.5498, + "step": 900 + }, + { + "epoch": 0.3078100420900472, + "grad_norm": 3.61871679460945, + "learning_rate": 4.972518050753146e-06, + "loss": 0.5527, + "step": 905 + }, + { + "epoch": 0.30951065005739553, + "grad_norm": 3.7899680523970214, + "learning_rate": 4.9721155506597e-06, + "loss": 0.5674, + "step": 910 + }, + { + "epoch": 0.31121125802474386, + "grad_norm": 5.213262776183392, + "learning_rate": 4.971710140979292e-06, + "loss": 0.5383, + "step": 915 + }, + { + "epoch": 0.3129118659920922, + "grad_norm": 5.3160173941547075, + "learning_rate": 4.971301822189077e-06, + "loss": 0.5613, + "step": 920 + }, + { + "epoch": 0.3146124739594405, + "grad_norm": 5.996017735597043, + "learning_rate": 4.970890594769627e-06, + "loss": 0.5563, + "step": 925 + }, + { + "epoch": 0.3163130819267888, + "grad_norm": 6.198763177382854, + "learning_rate": 4.970476459204945e-06, + "loss": 0.5409, + "step": 930 + }, + { + "epoch": 0.31801368989413714, + "grad_norm": 4.017575965045663, + "learning_rate": 4.97005941598245e-06, + "loss": 0.5448, + "step": 935 + }, + { + "epoch": 0.3197142978614855, + "grad_norm": 3.672395789135882, + "learning_rate": 4.9696394655929884e-06, + "loss": 0.5129, + "step": 940 + }, + { + "epoch": 0.3214149058288338, + "grad_norm": 24.946087982059257, + "learning_rate": 4.9692166085308244e-06, + "loss": 0.5546, + "step": 945 + }, + { + "epoch": 0.32311551379618214, + "grad_norm": 5.3019007125672415, + "learning_rate": 4.968790845293646e-06, + "loss": 0.5655, + "step": 950 + }, + { + "epoch": 0.3248161217635305, + "grad_norm": 5.693729615742935, + "learning_rate": 4.96836217638256e-06, + "loss": 0.5535, + "step": 955 + }, + { + "epoch": 0.3265167297308788, + "grad_norm": 5.49960606645776, + "learning_rate": 4.967930602302094e-06, + "loss": 0.5707, + "step": 960 + }, + { + "epoch": 0.32821733769822714, + "grad_norm": 16.264894478999373, + "learning_rate": 4.967496123560193e-06, + "loss": 0.5545, + "step": 965 + }, + { + "epoch": 0.3299179456655754, + "grad_norm": 7.924514395325215, + "learning_rate": 4.9670587406682235e-06, + "loss": 0.562, + "step": 970 + }, + { + "epoch": 0.33161855363292375, + "grad_norm": 8.601257289905918, + "learning_rate": 4.966618454140969e-06, + "loss": 0.5537, + "step": 975 + }, + { + "epoch": 0.3333191616002721, + "grad_norm": 8.691735260143128, + "learning_rate": 4.966175264496629e-06, + "loss": 0.5569, + "step": 980 + }, + { + "epoch": 0.3350197695676204, + "grad_norm": 19.92859770802887, + "learning_rate": 4.965729172256822e-06, + "loss": 0.5485, + "step": 985 + }, + { + "epoch": 0.33672037753496875, + "grad_norm": 6.849877453900109, + "learning_rate": 4.9652801779465815e-06, + "loss": 0.5443, + "step": 990 + }, + { + "epoch": 0.3384209855023171, + "grad_norm": 6.020515213223856, + "learning_rate": 4.964828282094356e-06, + "loss": 0.5159, + "step": 995 + }, + { + "epoch": 0.3401215934696654, + "grad_norm": 18.6239988809091, + "learning_rate": 4.964373485232012e-06, + "loss": 0.5711, + "step": 1000 + }, + { + "epoch": 0.34182220143701375, + "grad_norm": 4.822075213885056, + "learning_rate": 4.963915787894827e-06, + "loss": 0.5349, + "step": 1005 + }, + { + "epoch": 0.3435228094043621, + "grad_norm": 8.028398581194914, + "learning_rate": 4.963455190621492e-06, + "loss": 0.5413, + "step": 1010 + }, + { + "epoch": 0.34522341737171036, + "grad_norm": 5.371954315819236, + "learning_rate": 4.962991693954115e-06, + "loss": 0.5536, + "step": 1015 + }, + { + "epoch": 0.3469240253390587, + "grad_norm": 5.842648577621306, + "learning_rate": 4.962525298438213e-06, + "loss": 0.5436, + "step": 1020 + }, + { + "epoch": 0.34862463330640703, + "grad_norm": 4.632859936784621, + "learning_rate": 4.962056004622716e-06, + "loss": 0.5334, + "step": 1025 + }, + { + "epoch": 0.35032524127375536, + "grad_norm": 8.003687856035425, + "learning_rate": 4.961583813059966e-06, + "loss": 0.5131, + "step": 1030 + }, + { + "epoch": 0.3520258492411037, + "grad_norm": 11.804117251299203, + "learning_rate": 4.961108724305714e-06, + "loss": 0.5131, + "step": 1035 + }, + { + "epoch": 0.353726457208452, + "grad_norm": 5.612226850601007, + "learning_rate": 4.960630738919122e-06, + "loss": 0.5585, + "step": 1040 + }, + { + "epoch": 0.35542706517580036, + "grad_norm": 10.67681254905423, + "learning_rate": 4.9601498574627604e-06, + "loss": 0.5269, + "step": 1045 + }, + { + "epoch": 0.3571276731431487, + "grad_norm": 5.676959649188794, + "learning_rate": 4.959666080502609e-06, + "loss": 0.5331, + "step": 1050 + }, + { + "epoch": 0.358828281110497, + "grad_norm": 7.879954337915793, + "learning_rate": 4.959179408608053e-06, + "loss": 0.5649, + "step": 1055 + }, + { + "epoch": 0.3605288890778453, + "grad_norm": 10.994295716410706, + "learning_rate": 4.958689842351891e-06, + "loss": 0.5212, + "step": 1060 + }, + { + "epoch": 0.36222949704519364, + "grad_norm": 6.393504474286174, + "learning_rate": 4.95819738231032e-06, + "loss": 0.5469, + "step": 1065 + }, + { + "epoch": 0.36393010501254197, + "grad_norm": 5.526703277625495, + "learning_rate": 4.95770202906295e-06, + "loss": 0.5405, + "step": 1070 + }, + { + "epoch": 0.3656307129798903, + "grad_norm": 4.905009308794142, + "learning_rate": 4.957203783192791e-06, + "loss": 0.5049, + "step": 1075 + }, + { + "epoch": 0.36733132094723864, + "grad_norm": 3.650707522940902, + "learning_rate": 4.956702645286261e-06, + "loss": 0.5308, + "step": 1080 + }, + { + "epoch": 0.36903192891458697, + "grad_norm": 26.974090854070898, + "learning_rate": 4.95619861593318e-06, + "loss": 0.5526, + "step": 1085 + }, + { + "epoch": 0.3707325368819353, + "grad_norm": 5.833810588702081, + "learning_rate": 4.955691695726771e-06, + "loss": 0.5483, + "step": 1090 + }, + { + "epoch": 0.37243314484928364, + "grad_norm": 8.1828994258448, + "learning_rate": 4.95518188526366e-06, + "loss": 0.5137, + "step": 1095 + }, + { + "epoch": 0.37413375281663197, + "grad_norm": 4.851544353465898, + "learning_rate": 4.954669185143876e-06, + "loss": 0.5173, + "step": 1100 + }, + { + "epoch": 0.37583436078398025, + "grad_norm": 5.393727849252035, + "learning_rate": 4.9541535959708466e-06, + "loss": 0.5568, + "step": 1105 + }, + { + "epoch": 0.3775349687513286, + "grad_norm": 4.42721889010259, + "learning_rate": 4.953635118351401e-06, + "loss": 0.5273, + "step": 1110 + }, + { + "epoch": 0.3792355767186769, + "grad_norm": 9.849567416783387, + "learning_rate": 4.953113752895769e-06, + "loss": 0.5445, + "step": 1115 + }, + { + "epoch": 0.38093618468602525, + "grad_norm": 6.044269741022495, + "learning_rate": 4.952589500217576e-06, + "loss": 0.5313, + "step": 1120 + }, + { + "epoch": 0.3826367926533736, + "grad_norm": 4.982181056132783, + "learning_rate": 4.952062360933849e-06, + "loss": 0.5473, + "step": 1125 + }, + { + "epoch": 0.3843374006207219, + "grad_norm": 4.301027365473603, + "learning_rate": 4.9515323356650115e-06, + "loss": 0.5688, + "step": 1130 + }, + { + "epoch": 0.38603800858807025, + "grad_norm": 3.643635403946816, + "learning_rate": 4.950999425034882e-06, + "loss": 0.5363, + "step": 1135 + }, + { + "epoch": 0.3877386165554186, + "grad_norm": 3.83748828648786, + "learning_rate": 4.950463629670678e-06, + "loss": 0.5212, + "step": 1140 + }, + { + "epoch": 0.3894392245227669, + "grad_norm": 4.663616763812123, + "learning_rate": 4.949924950203009e-06, + "loss": 0.5326, + "step": 1145 + }, + { + "epoch": 0.3911398324901152, + "grad_norm": 6.234666871922535, + "learning_rate": 4.949383387265881e-06, + "loss": 0.5414, + "step": 1150 + }, + { + "epoch": 0.3928404404574635, + "grad_norm": 5.132252044177524, + "learning_rate": 4.948838941496692e-06, + "loss": 0.5429, + "step": 1155 + }, + { + "epoch": 0.39454104842481186, + "grad_norm": 6.7815614041683885, + "learning_rate": 4.948291613536237e-06, + "loss": 0.5556, + "step": 1160 + }, + { + "epoch": 0.3962416563921602, + "grad_norm": 6.167051999446675, + "learning_rate": 4.947741404028697e-06, + "loss": 0.5245, + "step": 1165 + }, + { + "epoch": 0.3979422643595085, + "grad_norm": 3.9963057579837558, + "learning_rate": 4.94718831362165e-06, + "loss": 0.5151, + "step": 1170 + }, + { + "epoch": 0.39964287232685686, + "grad_norm": 5.246564898589007, + "learning_rate": 4.946632342966063e-06, + "loss": 0.5208, + "step": 1175 + }, + { + "epoch": 0.4013434802942052, + "grad_norm": 5.94690942725412, + "learning_rate": 4.946073492716291e-06, + "loss": 0.5308, + "step": 1180 + }, + { + "epoch": 0.4030440882615535, + "grad_norm": 4.144690597020785, + "learning_rate": 4.945511763530081e-06, + "loss": 0.5042, + "step": 1185 + }, + { + "epoch": 0.40474469622890186, + "grad_norm": 9.088228806626756, + "learning_rate": 4.944947156068567e-06, + "loss": 0.5161, + "step": 1190 + }, + { + "epoch": 0.40644530419625013, + "grad_norm": 4.627393792456539, + "learning_rate": 4.944379670996269e-06, + "loss": 0.5397, + "step": 1195 + }, + { + "epoch": 0.40814591216359847, + "grad_norm": 3.9794107239823315, + "learning_rate": 4.943809308981097e-06, + "loss": 0.5032, + "step": 1200 + }, + { + "epoch": 0.4098465201309468, + "grad_norm": 5.199479025001829, + "learning_rate": 4.943236070694346e-06, + "loss": 0.506, + "step": 1205 + }, + { + "epoch": 0.41154712809829513, + "grad_norm": 3.6782110958459286, + "learning_rate": 4.942659956810695e-06, + "loss": 0.5125, + "step": 1210 + }, + { + "epoch": 0.41324773606564347, + "grad_norm": 5.334976336098045, + "learning_rate": 4.9420809680082095e-06, + "loss": 0.5203, + "step": 1215 + }, + { + "epoch": 0.4149483440329918, + "grad_norm": 5.595794839734362, + "learning_rate": 4.941499104968336e-06, + "loss": 0.5499, + "step": 1220 + }, + { + "epoch": 0.41664895200034013, + "grad_norm": 6.580528830910674, + "learning_rate": 4.9409143683759065e-06, + "loss": 0.4854, + "step": 1225 + }, + { + "epoch": 0.41834955996768847, + "grad_norm": 3.524339063672003, + "learning_rate": 4.940326758919133e-06, + "loss": 0.5283, + "step": 1230 + }, + { + "epoch": 0.4200501679350368, + "grad_norm": 5.1992380024915645, + "learning_rate": 4.93973627728961e-06, + "loss": 0.5289, + "step": 1235 + }, + { + "epoch": 0.4217507759023851, + "grad_norm": 3.932664128734379, + "learning_rate": 4.939142924182314e-06, + "loss": 0.5446, + "step": 1240 + }, + { + "epoch": 0.4234513838697334, + "grad_norm": 8.984495756116655, + "learning_rate": 4.9385467002955965e-06, + "loss": 0.5124, + "step": 1245 + }, + { + "epoch": 0.42515199183708174, + "grad_norm": 4.594542649829299, + "learning_rate": 4.937947606331192e-06, + "loss": 0.5525, + "step": 1250 + }, + { + "epoch": 0.4268525998044301, + "grad_norm": 3.943719389932639, + "learning_rate": 4.937345642994211e-06, + "loss": 0.5187, + "step": 1255 + }, + { + "epoch": 0.4285532077717784, + "grad_norm": 6.6305586617254635, + "learning_rate": 4.936740810993143e-06, + "loss": 0.524, + "step": 1260 + }, + { + "epoch": 0.43025381573912674, + "grad_norm": 4.2711052445927935, + "learning_rate": 4.936133111039852e-06, + "loss": 0.5247, + "step": 1265 + }, + { + "epoch": 0.4319544237064751, + "grad_norm": 4.105088800881805, + "learning_rate": 4.9355225438495755e-06, + "loss": 0.5343, + "step": 1270 + }, + { + "epoch": 0.4336550316738234, + "grad_norm": 29.45639019594073, + "learning_rate": 4.934909110140932e-06, + "loss": 0.5107, + "step": 1275 + }, + { + "epoch": 0.43535563964117174, + "grad_norm": 9.843585653327034, + "learning_rate": 4.934292810635907e-06, + "loss": 0.4964, + "step": 1280 + }, + { + "epoch": 0.43705624760852, + "grad_norm": 4.397799128373228, + "learning_rate": 4.933673646059863e-06, + "loss": 0.5314, + "step": 1285 + }, + { + "epoch": 0.43875685557586835, + "grad_norm": 3.6290143915072637, + "learning_rate": 4.933051617141533e-06, + "loss": 0.5169, + "step": 1290 + }, + { + "epoch": 0.4404574635432167, + "grad_norm": 4.597619249145893, + "learning_rate": 4.932426724613023e-06, + "loss": 0.5304, + "step": 1295 + }, + { + "epoch": 0.442158071510565, + "grad_norm": 4.281712154934983, + "learning_rate": 4.931798969209806e-06, + "loss": 0.5203, + "step": 1300 + }, + { + "epoch": 0.44385867947791335, + "grad_norm": 4.16242238081176, + "learning_rate": 4.931168351670727e-06, + "loss": 0.5115, + "step": 1305 + }, + { + "epoch": 0.4455592874452617, + "grad_norm": 3.668437512941314, + "learning_rate": 4.930534872737999e-06, + "loss": 0.5134, + "step": 1310 + }, + { + "epoch": 0.44725989541261, + "grad_norm": 3.914120522681882, + "learning_rate": 4.929898533157206e-06, + "loss": 0.5059, + "step": 1315 + }, + { + "epoch": 0.44896050337995835, + "grad_norm": 17.584805925934216, + "learning_rate": 4.92925933367729e-06, + "loss": 0.5062, + "step": 1320 + }, + { + "epoch": 0.4506611113473067, + "grad_norm": 9.716755292603246, + "learning_rate": 4.928617275050569e-06, + "loss": 0.543, + "step": 1325 + }, + { + "epoch": 0.45236171931465496, + "grad_norm": 9.825666844295863, + "learning_rate": 4.927972358032721e-06, + "loss": 0.5192, + "step": 1330 + }, + { + "epoch": 0.4540623272820033, + "grad_norm": 5.309335957751379, + "learning_rate": 4.927324583382788e-06, + "loss": 0.4956, + "step": 1335 + }, + { + "epoch": 0.45576293524935163, + "grad_norm": 6.105104421881541, + "learning_rate": 4.926673951863178e-06, + "loss": 0.5084, + "step": 1340 + }, + { + "epoch": 0.45746354321669996, + "grad_norm": 9.0127339165914, + "learning_rate": 4.926020464239658e-06, + "loss": 0.5192, + "step": 1345 + }, + { + "epoch": 0.4591641511840483, + "grad_norm": 7.508306727379984, + "learning_rate": 4.92536412128136e-06, + "loss": 0.5402, + "step": 1350 + }, + { + "epoch": 0.46086475915139663, + "grad_norm": 10.958004027192123, + "learning_rate": 4.924704923760773e-06, + "loss": 0.5311, + "step": 1355 + }, + { + "epoch": 0.46256536711874496, + "grad_norm": 23.348256819461614, + "learning_rate": 4.924042872453749e-06, + "loss": 0.5068, + "step": 1360 + }, + { + "epoch": 0.4642659750860933, + "grad_norm": 4.384966711835645, + "learning_rate": 4.923377968139498e-06, + "loss": 0.5272, + "step": 1365 + }, + { + "epoch": 0.46596658305344163, + "grad_norm": 5.097304462399159, + "learning_rate": 4.922710211600586e-06, + "loss": 0.4841, + "step": 1370 + }, + { + "epoch": 0.4676671910207899, + "grad_norm": 8.238318724884653, + "learning_rate": 4.922039603622939e-06, + "loss": 0.5133, + "step": 1375 + }, + { + "epoch": 0.46936779898813824, + "grad_norm": 4.312265953800085, + "learning_rate": 4.921366144995835e-06, + "loss": 0.5089, + "step": 1380 + }, + { + "epoch": 0.4710684069554866, + "grad_norm": 21.21052229234248, + "learning_rate": 4.920689836511911e-06, + "loss": 0.5194, + "step": 1385 + }, + { + "epoch": 0.4727690149228349, + "grad_norm": 6.828270310440812, + "learning_rate": 4.920010678967158e-06, + "loss": 0.4983, + "step": 1390 + }, + { + "epoch": 0.47446962289018324, + "grad_norm": 16.456284342932708, + "learning_rate": 4.919328673160916e-06, + "loss": 0.5232, + "step": 1395 + }, + { + "epoch": 0.4761702308575316, + "grad_norm": 5.452512136180211, + "learning_rate": 4.918643819895881e-06, + "loss": 0.4872, + "step": 1400 + }, + { + "epoch": 0.4778708388248799, + "grad_norm": 10.00043037154549, + "learning_rate": 4.917956119978101e-06, + "loss": 0.5121, + "step": 1405 + }, + { + "epoch": 0.47957144679222824, + "grad_norm": 6.648280131979151, + "learning_rate": 4.917265574216972e-06, + "loss": 0.4972, + "step": 1410 + }, + { + "epoch": 0.4812720547595766, + "grad_norm": 7.056170116608334, + "learning_rate": 4.9165721834252386e-06, + "loss": 0.5109, + "step": 1415 + }, + { + "epoch": 0.48297266272692485, + "grad_norm": 6.63777649357413, + "learning_rate": 4.915875948418999e-06, + "loss": 0.5093, + "step": 1420 + }, + { + "epoch": 0.4846732706942732, + "grad_norm": 6.289903815492902, + "learning_rate": 4.915176870017693e-06, + "loss": 0.4841, + "step": 1425 + }, + { + "epoch": 0.4863738786616215, + "grad_norm": 22.911413804487406, + "learning_rate": 4.9144749490441116e-06, + "loss": 0.5205, + "step": 1430 + }, + { + "epoch": 0.48807448662896985, + "grad_norm": 6.84767145522513, + "learning_rate": 4.913770186324387e-06, + "loss": 0.5217, + "step": 1435 + }, + { + "epoch": 0.4897750945963182, + "grad_norm": 15.458207722628924, + "learning_rate": 4.9130625826879996e-06, + "loss": 0.5044, + "step": 1440 + }, + { + "epoch": 0.4914757025636665, + "grad_norm": 6.0191432394301145, + "learning_rate": 4.912352138967773e-06, + "loss": 0.5227, + "step": 1445 + }, + { + "epoch": 0.49317631053101485, + "grad_norm": 5.759768357934574, + "learning_rate": 4.911638855999872e-06, + "loss": 0.5126, + "step": 1450 + }, + { + "epoch": 0.4948769184983632, + "grad_norm": 26.23263891230328, + "learning_rate": 4.910922734623804e-06, + "loss": 0.5401, + "step": 1455 + }, + { + "epoch": 0.4965775264657115, + "grad_norm": 8.5652855148443, + "learning_rate": 4.910203775682416e-06, + "loss": 0.5321, + "step": 1460 + }, + { + "epoch": 0.4982781344330598, + "grad_norm": 11.817399115283937, + "learning_rate": 4.909481980021897e-06, + "loss": 0.4836, + "step": 1465 + }, + { + "epoch": 0.4999787424004081, + "grad_norm": 7.423591169847086, + "learning_rate": 4.908757348491772e-06, + "loss": 0.5055, + "step": 1470 + }, + { + "epoch": 0.5016793503677565, + "grad_norm": 11.43019213459179, + "learning_rate": 4.9080298819449065e-06, + "loss": 0.5026, + "step": 1475 + }, + { + "epoch": 0.5033799583351048, + "grad_norm": 5.952995411347645, + "learning_rate": 4.9072995812375e-06, + "loss": 0.5047, + "step": 1480 + }, + { + "epoch": 0.5050805663024531, + "grad_norm": 5.842345126987176, + "learning_rate": 4.906566447229089e-06, + "loss": 0.5295, + "step": 1485 + }, + { + "epoch": 0.5067811742698014, + "grad_norm": 12.139042350418991, + "learning_rate": 4.905830480782546e-06, + "loss": 0.5132, + "step": 1490 + }, + { + "epoch": 0.5084817822371498, + "grad_norm": 6.024078140148232, + "learning_rate": 4.905091682764074e-06, + "loss": 0.504, + "step": 1495 + }, + { + "epoch": 0.5101823902044981, + "grad_norm": 6.014832908987949, + "learning_rate": 4.904350054043212e-06, + "loss": 0.5069, + "step": 1500 + }, + { + "epoch": 0.5118829981718465, + "grad_norm": 4.221709319864658, + "learning_rate": 4.9036055954928275e-06, + "loss": 0.5106, + "step": 1505 + }, + { + "epoch": 0.5135836061391947, + "grad_norm": 8.242477093088722, + "learning_rate": 4.9028583079891225e-06, + "loss": 0.5255, + "step": 1510 + }, + { + "epoch": 0.5152842141065431, + "grad_norm": 7.853884867567823, + "learning_rate": 4.902108192411623e-06, + "loss": 0.5027, + "step": 1515 + }, + { + "epoch": 0.5169848220738914, + "grad_norm": 8.378738866524177, + "learning_rate": 4.90135524964319e-06, + "loss": 0.5034, + "step": 1520 + }, + { + "epoch": 0.5186854300412398, + "grad_norm": 7.10934057499516, + "learning_rate": 4.900599480570007e-06, + "loss": 0.5433, + "step": 1525 + }, + { + "epoch": 0.5203860380085881, + "grad_norm": 3.633807957238506, + "learning_rate": 4.899840886081587e-06, + "loss": 0.4692, + "step": 1530 + }, + { + "epoch": 0.5220866459759363, + "grad_norm": 22.01575433481114, + "learning_rate": 4.899079467070765e-06, + "loss": 0.5064, + "step": 1535 + }, + { + "epoch": 0.5237872539432847, + "grad_norm": 7.008822525300486, + "learning_rate": 4.898315224433705e-06, + "loss": 0.503, + "step": 1540 + }, + { + "epoch": 0.525487861910633, + "grad_norm": 6.499364764260329, + "learning_rate": 4.89754815906989e-06, + "loss": 0.5127, + "step": 1545 + }, + { + "epoch": 0.5271884698779814, + "grad_norm": 7.1158336276148555, + "learning_rate": 4.896778271882129e-06, + "loss": 0.4955, + "step": 1550 + }, + { + "epoch": 0.5288890778453297, + "grad_norm": 4.583909761477803, + "learning_rate": 4.896005563776548e-06, + "loss": 0.4843, + "step": 1555 + }, + { + "epoch": 0.5305896858126781, + "grad_norm": 8.683598896960767, + "learning_rate": 4.895230035662596e-06, + "loss": 0.522, + "step": 1560 + }, + { + "epoch": 0.5322902937800263, + "grad_norm": 9.43675992818845, + "learning_rate": 4.894451688453041e-06, + "loss": 0.5015, + "step": 1565 + }, + { + "epoch": 0.5339909017473747, + "grad_norm": 6.993609954246139, + "learning_rate": 4.893670523063969e-06, + "loss": 0.4975, + "step": 1570 + }, + { + "epoch": 0.535691509714723, + "grad_norm": 8.637779532146578, + "learning_rate": 4.892886540414781e-06, + "loss": 0.5043, + "step": 1575 + }, + { + "epoch": 0.5373921176820713, + "grad_norm": 4.197137588803186, + "learning_rate": 4.892099741428195e-06, + "loss": 0.4979, + "step": 1580 + }, + { + "epoch": 0.5390927256494197, + "grad_norm": 4.895856929628726, + "learning_rate": 4.891310127030245e-06, + "loss": 0.5354, + "step": 1585 + }, + { + "epoch": 0.540793333616768, + "grad_norm": 77.59232070659658, + "learning_rate": 4.890517698150277e-06, + "loss": 0.4963, + "step": 1590 + }, + { + "epoch": 0.5424939415841163, + "grad_norm": 5.082350540203327, + "learning_rate": 4.8897224557209485e-06, + "loss": 0.4849, + "step": 1595 + }, + { + "epoch": 0.5441945495514646, + "grad_norm": 29.652950399011726, + "learning_rate": 4.8889244006782315e-06, + "loss": 0.5093, + "step": 1600 + }, + { + "epoch": 0.545895157518813, + "grad_norm": 3.7542360216139876, + "learning_rate": 4.8881235339614065e-06, + "loss": 0.496, + "step": 1605 + }, + { + "epoch": 0.5475957654861613, + "grad_norm": 5.102119315535732, + "learning_rate": 4.887319856513064e-06, + "loss": 0.4814, + "step": 1610 + }, + { + "epoch": 0.5492963734535097, + "grad_norm": 48.189015212909666, + "learning_rate": 4.8865133692791e-06, + "loss": 0.5043, + "step": 1615 + }, + { + "epoch": 0.550996981420858, + "grad_norm": 5.108464793813238, + "learning_rate": 4.885704073208723e-06, + "loss": 0.4871, + "step": 1620 + }, + { + "epoch": 0.5526975893882062, + "grad_norm": 3.801975209912871, + "learning_rate": 4.88489196925444e-06, + "loss": 0.4954, + "step": 1625 + }, + { + "epoch": 0.5543981973555546, + "grad_norm": 19.09107126052143, + "learning_rate": 4.88407705837207e-06, + "loss": 0.5217, + "step": 1630 + }, + { + "epoch": 0.5560988053229029, + "grad_norm": 4.28185994919201, + "learning_rate": 4.8832593415207306e-06, + "loss": 0.4829, + "step": 1635 + }, + { + "epoch": 0.5577994132902513, + "grad_norm": 5.515135680286602, + "learning_rate": 4.882438819662844e-06, + "loss": 0.5161, + "step": 1640 + }, + { + "epoch": 0.5595000212575996, + "grad_norm": 41.17614904295365, + "learning_rate": 4.881615493764136e-06, + "loss": 0.5137, + "step": 1645 + }, + { + "epoch": 0.561200629224948, + "grad_norm": 4.3491776766922605, + "learning_rate": 4.8807893647936266e-06, + "loss": 0.4831, + "step": 1650 + }, + { + "epoch": 0.5629012371922962, + "grad_norm": 21.235110695196852, + "learning_rate": 4.879960433723641e-06, + "loss": 0.4687, + "step": 1655 + }, + { + "epoch": 0.5646018451596446, + "grad_norm": 11.409972086344075, + "learning_rate": 4.879128701529798e-06, + "loss": 0.5001, + "step": 1660 + }, + { + "epoch": 0.5663024531269929, + "grad_norm": 15.662946322646652, + "learning_rate": 4.878294169191017e-06, + "loss": 0.4718, + "step": 1665 + }, + { + "epoch": 0.5680030610943412, + "grad_norm": 5.2615695796352835, + "learning_rate": 4.87745683768951e-06, + "loss": 0.5159, + "step": 1670 + }, + { + "epoch": 0.5697036690616896, + "grad_norm": 5.4981450533202825, + "learning_rate": 4.8766167080107845e-06, + "loss": 0.5018, + "step": 1675 + }, + { + "epoch": 0.5714042770290378, + "grad_norm": 4.772009449482228, + "learning_rate": 4.875773781143642e-06, + "loss": 0.4969, + "step": 1680 + }, + { + "epoch": 0.5731048849963862, + "grad_norm": 6.217195880579487, + "learning_rate": 4.874928058080176e-06, + "loss": 0.5014, + "step": 1685 + }, + { + "epoch": 0.5748054929637345, + "grad_norm": 4.835096996723914, + "learning_rate": 4.87407953981577e-06, + "loss": 0.5009, + "step": 1690 + }, + { + "epoch": 0.5765061009310829, + "grad_norm": 11.76873010691928, + "learning_rate": 4.873228227349098e-06, + "loss": 0.5219, + "step": 1695 + }, + { + "epoch": 0.5782067088984312, + "grad_norm": 7.812298909878062, + "learning_rate": 4.872374121682124e-06, + "loss": 0.4769, + "step": 1700 + }, + { + "epoch": 0.5799073168657796, + "grad_norm": 6.354998163013655, + "learning_rate": 4.871517223820097e-06, + "loss": 0.512, + "step": 1705 + }, + { + "epoch": 0.5816079248331278, + "grad_norm": 5.045657877477977, + "learning_rate": 4.870657534771553e-06, + "loss": 0.4787, + "step": 1710 + }, + { + "epoch": 0.5833085328004761, + "grad_norm": 4.220803612156143, + "learning_rate": 4.869795055548316e-06, + "loss": 0.4906, + "step": 1715 + }, + { + "epoch": 0.5850091407678245, + "grad_norm": 5.753072013667585, + "learning_rate": 4.868929787165488e-06, + "loss": 0.5104, + "step": 1720 + }, + { + "epoch": 0.5867097487351728, + "grad_norm": 7.075151115902222, + "learning_rate": 4.8680617306414605e-06, + "loss": 0.4792, + "step": 1725 + }, + { + "epoch": 0.5884103567025212, + "grad_norm": 7.925818725105634, + "learning_rate": 4.867190886997902e-06, + "loss": 0.4854, + "step": 1730 + }, + { + "epoch": 0.5901109646698695, + "grad_norm": 4.251475398397968, + "learning_rate": 4.8663172572597635e-06, + "loss": 0.5057, + "step": 1735 + }, + { + "epoch": 0.5918115726372178, + "grad_norm": 5.747439077315864, + "learning_rate": 4.865440842455273e-06, + "loss": 0.4957, + "step": 1740 + }, + { + "epoch": 0.5935121806045661, + "grad_norm": 6.078600480831179, + "learning_rate": 4.86456164361594e-06, + "loss": 0.4954, + "step": 1745 + }, + { + "epoch": 0.5952127885719145, + "grad_norm": 13.313329606941615, + "learning_rate": 4.863679661776546e-06, + "loss": 0.4921, + "step": 1750 + }, + { + "epoch": 0.5969133965392628, + "grad_norm": 17.953981141289145, + "learning_rate": 4.862794897975152e-06, + "loss": 0.5218, + "step": 1755 + }, + { + "epoch": 0.5986140045066111, + "grad_norm": 6.03392051370175, + "learning_rate": 4.86190735325309e-06, + "loss": 0.4786, + "step": 1760 + }, + { + "epoch": 0.6003146124739595, + "grad_norm": 4.247443888478685, + "learning_rate": 4.861017028654968e-06, + "loss": 0.5048, + "step": 1765 + }, + { + "epoch": 0.6020152204413077, + "grad_norm": 4.88629937124035, + "learning_rate": 4.8601239252286656e-06, + "loss": 0.4843, + "step": 1770 + }, + { + "epoch": 0.6037158284086561, + "grad_norm": 4.053847272751819, + "learning_rate": 4.859228044025329e-06, + "loss": 0.5098, + "step": 1775 + }, + { + "epoch": 0.6054164363760044, + "grad_norm": 20.647837698987473, + "learning_rate": 4.85832938609938e-06, + "loss": 0.5073, + "step": 1780 + }, + { + "epoch": 0.6071170443433528, + "grad_norm": 3.5679725080288116, + "learning_rate": 4.857427952508502e-06, + "loss": 0.4983, + "step": 1785 + }, + { + "epoch": 0.6088176523107011, + "grad_norm": 9.954128237756244, + "learning_rate": 4.856523744313651e-06, + "loss": 0.4835, + "step": 1790 + }, + { + "epoch": 0.6105182602780495, + "grad_norm": 3.2295931672229274, + "learning_rate": 4.855616762579045e-06, + "loss": 0.4841, + "step": 1795 + }, + { + "epoch": 0.6122188682453977, + "grad_norm": 4.346330125227621, + "learning_rate": 4.854707008372166e-06, + "loss": 0.5096, + "step": 1800 + }, + { + "epoch": 0.613919476212746, + "grad_norm": 7.383006888543724, + "learning_rate": 4.853794482763763e-06, + "loss": 0.4938, + "step": 1805 + }, + { + "epoch": 0.6156200841800944, + "grad_norm": 4.317265827403654, + "learning_rate": 4.852879186827843e-06, + "loss": 0.5038, + "step": 1810 + }, + { + "epoch": 0.6173206921474427, + "grad_norm": 4.598508284769239, + "learning_rate": 4.851961121641674e-06, + "loss": 0.4873, + "step": 1815 + }, + { + "epoch": 0.6190213001147911, + "grad_norm": 5.382585627951489, + "learning_rate": 4.851040288285786e-06, + "loss": 0.4984, + "step": 1820 + }, + { + "epoch": 0.6207219080821393, + "grad_norm": 8.002827010093496, + "learning_rate": 4.850116687843963e-06, + "loss": 0.4788, + "step": 1825 + }, + { + "epoch": 0.6224225160494877, + "grad_norm": 3.9799533025063973, + "learning_rate": 4.849190321403251e-06, + "loss": 0.4498, + "step": 1830 + }, + { + "epoch": 0.624123124016836, + "grad_norm": 8.894414052201425, + "learning_rate": 4.848261190053946e-06, + "loss": 0.4979, + "step": 1835 + }, + { + "epoch": 0.6258237319841844, + "grad_norm": 3.412756862000719, + "learning_rate": 4.8473292948896005e-06, + "loss": 0.4979, + "step": 1840 + }, + { + "epoch": 0.6275243399515327, + "grad_norm": 3.873859775374139, + "learning_rate": 4.846394637007022e-06, + "loss": 0.4902, + "step": 1845 + }, + { + "epoch": 0.629224947918881, + "grad_norm": 3.69179871568231, + "learning_rate": 4.845457217506265e-06, + "loss": 0.48, + "step": 1850 + }, + { + "epoch": 0.6309255558862293, + "grad_norm": 8.0157501503533, + "learning_rate": 4.84451703749064e-06, + "loss": 0.4866, + "step": 1855 + }, + { + "epoch": 0.6326261638535776, + "grad_norm": 6.609620215897019, + "learning_rate": 4.843574098066701e-06, + "loss": 0.4965, + "step": 1860 + }, + { + "epoch": 0.634326771820926, + "grad_norm": 3.693916650709301, + "learning_rate": 4.842628400344253e-06, + "loss": 0.4947, + "step": 1865 + }, + { + "epoch": 0.6360273797882743, + "grad_norm": 4.423863521675628, + "learning_rate": 4.841679945436348e-06, + "loss": 0.4773, + "step": 1870 + }, + { + "epoch": 0.6377279877556227, + "grad_norm": 6.541175340939763, + "learning_rate": 4.84072873445928e-06, + "loss": 0.4829, + "step": 1875 + }, + { + "epoch": 0.639428595722971, + "grad_norm": 3.9317693296152307, + "learning_rate": 4.8397747685325895e-06, + "loss": 0.4943, + "step": 1880 + }, + { + "epoch": 0.6411292036903193, + "grad_norm": 4.774570588691984, + "learning_rate": 4.838818048779057e-06, + "loss": 0.4889, + "step": 1885 + }, + { + "epoch": 0.6428298116576676, + "grad_norm": 4.703072623525204, + "learning_rate": 4.837858576324707e-06, + "loss": 0.5005, + "step": 1890 + }, + { + "epoch": 0.6445304196250159, + "grad_norm": 5.772442022027065, + "learning_rate": 4.8368963522988024e-06, + "loss": 0.4796, + "step": 1895 + }, + { + "epoch": 0.6462310275923643, + "grad_norm": 8.030324664728342, + "learning_rate": 4.835931377833845e-06, + "loss": 0.4812, + "step": 1900 + }, + { + "epoch": 0.6479316355597126, + "grad_norm": 3.5413513374729724, + "learning_rate": 4.834963654065572e-06, + "loss": 0.4856, + "step": 1905 + }, + { + "epoch": 0.649632243527061, + "grad_norm": 6.0837977052511025, + "learning_rate": 4.833993182132959e-06, + "loss": 0.4722, + "step": 1910 + }, + { + "epoch": 0.6513328514944092, + "grad_norm": 8.02450472974402, + "learning_rate": 4.833019963178214e-06, + "loss": 0.4681, + "step": 1915 + }, + { + "epoch": 0.6530334594617576, + "grad_norm": 4.181925906522143, + "learning_rate": 4.832043998346781e-06, + "loss": 0.4709, + "step": 1920 + }, + { + "epoch": 0.6547340674291059, + "grad_norm": 5.974847517676, + "learning_rate": 4.831065288787331e-06, + "loss": 0.4918, + "step": 1925 + }, + { + "epoch": 0.6564346753964543, + "grad_norm": 4.168052732228723, + "learning_rate": 4.830083835651771e-06, + "loss": 0.4852, + "step": 1930 + }, + { + "epoch": 0.6581352833638026, + "grad_norm": 6.232875709754771, + "learning_rate": 4.829099640095233e-06, + "loss": 0.4813, + "step": 1935 + }, + { + "epoch": 0.6598358913311508, + "grad_norm": 28.52667043886641, + "learning_rate": 4.828112703276078e-06, + "loss": 0.4605, + "step": 1940 + }, + { + "epoch": 0.6615364992984992, + "grad_norm": 46.96459715556022, + "learning_rate": 4.827123026355895e-06, + "loss": 0.494, + "step": 1945 + }, + { + "epoch": 0.6632371072658475, + "grad_norm": 6.5226820623683075, + "learning_rate": 4.826130610499495e-06, + "loss": 0.4722, + "step": 1950 + }, + { + "epoch": 0.6649377152331959, + "grad_norm": 7.257587063242, + "learning_rate": 4.8251354568749135e-06, + "loss": 0.4574, + "step": 1955 + }, + { + "epoch": 0.6666383232005442, + "grad_norm": 16.44207952733053, + "learning_rate": 4.824137566653411e-06, + "loss": 0.4688, + "step": 1960 + }, + { + "epoch": 0.6683389311678926, + "grad_norm": 10.657328727348993, + "learning_rate": 4.823136941009465e-06, + "loss": 0.5342, + "step": 1965 + }, + { + "epoch": 0.6700395391352408, + "grad_norm": 11.670933092961624, + "learning_rate": 4.822133581120775e-06, + "loss": 0.4665, + "step": 1970 + }, + { + "epoch": 0.6717401471025892, + "grad_norm": 12.400540360619097, + "learning_rate": 4.821127488168258e-06, + "loss": 0.4973, + "step": 1975 + }, + { + "epoch": 0.6734407550699375, + "grad_norm": 9.656617982085303, + "learning_rate": 4.820118663336047e-06, + "loss": 0.4849, + "step": 1980 + }, + { + "epoch": 0.6751413630372858, + "grad_norm": 7.568416904448632, + "learning_rate": 4.819107107811491e-06, + "loss": 0.4602, + "step": 1985 + }, + { + "epoch": 0.6768419710046342, + "grad_norm": 25.48241061674708, + "learning_rate": 4.818092822785153e-06, + "loss": 0.4568, + "step": 1990 + }, + { + "epoch": 0.6785425789719824, + "grad_norm": 12.682770296425272, + "learning_rate": 4.817075809450808e-06, + "loss": 0.4829, + "step": 1995 + }, + { + "epoch": 0.6802431869393308, + "grad_norm": 204.72845926361106, + "learning_rate": 4.816056069005442e-06, + "loss": 0.4807, + "step": 2000 + }, + { + "epoch": 0.6819437949066791, + "grad_norm": 10.013600707099535, + "learning_rate": 4.815033602649253e-06, + "loss": 0.4473, + "step": 2005 + }, + { + "epoch": 0.6836444028740275, + "grad_norm": 4.951889922224116, + "learning_rate": 4.814008411585644e-06, + "loss": 0.4805, + "step": 2010 + }, + { + "epoch": 0.6853450108413758, + "grad_norm": 7.398756791518998, + "learning_rate": 4.812980497021225e-06, + "loss": 0.4641, + "step": 2015 + }, + { + "epoch": 0.6870456188087242, + "grad_norm": 22.45007834808948, + "learning_rate": 4.811949860165815e-06, + "loss": 0.4614, + "step": 2020 + }, + { + "epoch": 0.6887462267760724, + "grad_norm": 5.60126858273136, + "learning_rate": 4.810916502232434e-06, + "loss": 0.479, + "step": 2025 + }, + { + "epoch": 0.6904468347434207, + "grad_norm": 7.2788557552454085, + "learning_rate": 4.809880424437306e-06, + "loss": 0.4944, + "step": 2030 + }, + { + "epoch": 0.6921474427107691, + "grad_norm": 4.656186187366622, + "learning_rate": 4.808841627999854e-06, + "loss": 0.4833, + "step": 2035 + }, + { + "epoch": 0.6938480506781174, + "grad_norm": 30.248464246047728, + "learning_rate": 4.807800114142703e-06, + "loss": 0.4691, + "step": 2040 + }, + { + "epoch": 0.6955486586454658, + "grad_norm": 10.50047738121829, + "learning_rate": 4.806755884091676e-06, + "loss": 0.4793, + "step": 2045 + }, + { + "epoch": 0.6972492666128141, + "grad_norm": 31.140998402104383, + "learning_rate": 4.8057089390757924e-06, + "loss": 0.5166, + "step": 2050 + }, + { + "epoch": 0.6989498745801624, + "grad_norm": 8.90954865280583, + "learning_rate": 4.804659280327268e-06, + "loss": 0.4768, + "step": 2055 + }, + { + "epoch": 0.7006504825475107, + "grad_norm": 5.63500786631946, + "learning_rate": 4.803606909081509e-06, + "loss": 0.4884, + "step": 2060 + }, + { + "epoch": 0.7023510905148591, + "grad_norm": 7.777067999911534, + "learning_rate": 4.802551826577119e-06, + "loss": 0.4808, + "step": 2065 + }, + { + "epoch": 0.7040516984822074, + "grad_norm": 75.5694198184285, + "learning_rate": 4.8014940340558905e-06, + "loss": 0.4969, + "step": 2070 + }, + { + "epoch": 0.7057523064495557, + "grad_norm": 6.143201015797779, + "learning_rate": 4.800433532762804e-06, + "loss": 0.4624, + "step": 2075 + }, + { + "epoch": 0.707452914416904, + "grad_norm": 15.70149527409961, + "learning_rate": 4.79937032394603e-06, + "loss": 0.4822, + "step": 2080 + }, + { + "epoch": 0.7091535223842523, + "grad_norm": 5.0318953609604575, + "learning_rate": 4.7983044088569265e-06, + "loss": 0.499, + "step": 2085 + }, + { + "epoch": 0.7108541303516007, + "grad_norm": 5.90864243120211, + "learning_rate": 4.797235788750034e-06, + "loss": 0.4735, + "step": 2090 + }, + { + "epoch": 0.712554738318949, + "grad_norm": 15.263140292292057, + "learning_rate": 4.796164464883078e-06, + "loss": 0.4926, + "step": 2095 + }, + { + "epoch": 0.7142553462862974, + "grad_norm": 7.923676324997792, + "learning_rate": 4.795090438516969e-06, + "loss": 0.4995, + "step": 2100 + }, + { + "epoch": 0.7159559542536457, + "grad_norm": 8.587103946080287, + "learning_rate": 4.794013710915793e-06, + "loss": 0.4451, + "step": 2105 + }, + { + "epoch": 0.717656562220994, + "grad_norm": 4.278158469703077, + "learning_rate": 4.792934283346817e-06, + "loss": 0.475, + "step": 2110 + }, + { + "epoch": 0.7193571701883423, + "grad_norm": 9.063783270010104, + "learning_rate": 4.79185215708049e-06, + "loss": 0.4841, + "step": 2115 + }, + { + "epoch": 0.7210577781556906, + "grad_norm": 10.694221466512458, + "learning_rate": 4.790767333390431e-06, + "loss": 0.4758, + "step": 2120 + }, + { + "epoch": 0.722758386123039, + "grad_norm": 12.8885009650192, + "learning_rate": 4.789679813553439e-06, + "loss": 0.4966, + "step": 2125 + }, + { + "epoch": 0.7244589940903873, + "grad_norm": 5.5429762120679715, + "learning_rate": 4.788589598849482e-06, + "loss": 0.4346, + "step": 2130 + }, + { + "epoch": 0.7261596020577357, + "grad_norm": 5.05789608696174, + "learning_rate": 4.787496690561701e-06, + "loss": 0.5014, + "step": 2135 + }, + { + "epoch": 0.7278602100250839, + "grad_norm": 9.032561200753488, + "learning_rate": 4.786401089976411e-06, + "loss": 0.4887, + "step": 2140 + }, + { + "epoch": 0.7295608179924323, + "grad_norm": 9.045399707510782, + "learning_rate": 4.78530279838309e-06, + "loss": 0.4758, + "step": 2145 + }, + { + "epoch": 0.7312614259597806, + "grad_norm": 3.9827265791533573, + "learning_rate": 4.784201817074387e-06, + "loss": 0.4973, + "step": 2150 + }, + { + "epoch": 0.732962033927129, + "grad_norm": 4.782939617573199, + "learning_rate": 4.783098147346116e-06, + "loss": 0.4871, + "step": 2155 + }, + { + "epoch": 0.7346626418944773, + "grad_norm": 34.78788179165436, + "learning_rate": 4.7819917904972534e-06, + "loss": 0.4889, + "step": 2160 + }, + { + "epoch": 0.7363632498618256, + "grad_norm": 6.056648176350482, + "learning_rate": 4.78088274782994e-06, + "loss": 0.4651, + "step": 2165 + }, + { + "epoch": 0.7380638578291739, + "grad_norm": 3.1753379829358948, + "learning_rate": 4.779771020649478e-06, + "loss": 0.4661, + "step": 2170 + }, + { + "epoch": 0.7397644657965222, + "grad_norm": 5.045547762543024, + "learning_rate": 4.778656610264327e-06, + "loss": 0.4597, + "step": 2175 + }, + { + "epoch": 0.7414650737638706, + "grad_norm": 1269.2808540525675, + "learning_rate": 4.777539517986109e-06, + "loss": 0.4991, + "step": 2180 + }, + { + "epoch": 0.7431656817312189, + "grad_norm": 4.946687469448479, + "learning_rate": 4.776419745129596e-06, + "loss": 0.4759, + "step": 2185 + }, + { + "epoch": 0.7448662896985673, + "grad_norm": 3.4129729667348045, + "learning_rate": 4.775297293012719e-06, + "loss": 0.4765, + "step": 2190 + }, + { + "epoch": 0.7465668976659156, + "grad_norm": 4.111665483599155, + "learning_rate": 4.774172162956565e-06, + "loss": 0.5037, + "step": 2195 + }, + { + "epoch": 0.7482675056332639, + "grad_norm": 3.429305468282582, + "learning_rate": 4.773044356285367e-06, + "loss": 0.4945, + "step": 2200 + }, + { + "epoch": 0.7499681136006122, + "grad_norm": 5.395335191960881, + "learning_rate": 4.771913874326513e-06, + "loss": 0.4932, + "step": 2205 + }, + { + "epoch": 0.7516687215679605, + "grad_norm": 5.678600438248466, + "learning_rate": 4.770780718410535e-06, + "loss": 0.4765, + "step": 2210 + }, + { + "epoch": 0.7533693295353089, + "grad_norm": 5.155283968677559, + "learning_rate": 4.769644889871116e-06, + "loss": 0.4555, + "step": 2215 + }, + { + "epoch": 0.7550699375026572, + "grad_norm": 4.8046584137620405, + "learning_rate": 4.768506390045085e-06, + "loss": 0.4633, + "step": 2220 + }, + { + "epoch": 0.7567705454700056, + "grad_norm": 5.655460549118874, + "learning_rate": 4.767365220272412e-06, + "loss": 0.4666, + "step": 2225 + }, + { + "epoch": 0.7584711534373538, + "grad_norm": 25.4746580475505, + "learning_rate": 4.76622138189621e-06, + "loss": 0.459, + "step": 2230 + }, + { + "epoch": 0.7601717614047022, + "grad_norm": 6.276798385082229, + "learning_rate": 4.7650748762627355e-06, + "loss": 0.4816, + "step": 2235 + }, + { + "epoch": 0.7618723693720505, + "grad_norm": 5.280921721834364, + "learning_rate": 4.763925704721382e-06, + "loss": 0.4534, + "step": 2240 + }, + { + "epoch": 0.7635729773393989, + "grad_norm": 5.545205141472925, + "learning_rate": 4.762773868624681e-06, + "loss": 0.4845, + "step": 2245 + }, + { + "epoch": 0.7652735853067472, + "grad_norm": 4.326151217698957, + "learning_rate": 4.7616193693282995e-06, + "loss": 0.5031, + "step": 2250 + }, + { + "epoch": 0.7669741932740954, + "grad_norm": 20.35394810547081, + "learning_rate": 4.76046220819104e-06, + "loss": 0.4931, + "step": 2255 + }, + { + "epoch": 0.7686748012414438, + "grad_norm": 4.459175948393059, + "learning_rate": 4.759302386574839e-06, + "loss": 0.4586, + "step": 2260 + }, + { + "epoch": 0.7703754092087921, + "grad_norm": 6.448537415717612, + "learning_rate": 4.758139905844762e-06, + "loss": 0.4915, + "step": 2265 + }, + { + "epoch": 0.7720760171761405, + "grad_norm": 4.674162493692723, + "learning_rate": 4.756974767369005e-06, + "loss": 0.4758, + "step": 2270 + }, + { + "epoch": 0.7737766251434888, + "grad_norm": 5.882512364200616, + "learning_rate": 4.755806972518891e-06, + "loss": 0.4757, + "step": 2275 + }, + { + "epoch": 0.7754772331108372, + "grad_norm": 7.673762814821439, + "learning_rate": 4.754636522668873e-06, + "loss": 0.488, + "step": 2280 + }, + { + "epoch": 0.7771778410781854, + "grad_norm": 8.76539028078966, + "learning_rate": 4.753463419196523e-06, + "loss": 0.4725, + "step": 2285 + }, + { + "epoch": 0.7788784490455338, + "grad_norm": 5.686655568345862, + "learning_rate": 4.752287663482544e-06, + "loss": 0.4487, + "step": 2290 + }, + { + "epoch": 0.7805790570128821, + "grad_norm": 10.123980373339233, + "learning_rate": 4.751109256910753e-06, + "loss": 0.4721, + "step": 2295 + }, + { + "epoch": 0.7822796649802304, + "grad_norm": 11.095650127885145, + "learning_rate": 4.749928200868092e-06, + "loss": 0.446, + "step": 2300 + }, + { + "epoch": 0.7839802729475788, + "grad_norm": 5.396599575860084, + "learning_rate": 4.748744496744617e-06, + "loss": 0.4875, + "step": 2305 + }, + { + "epoch": 0.785680880914927, + "grad_norm": 5.606663991669503, + "learning_rate": 4.747558145933506e-06, + "loss": 0.4378, + "step": 2310 + }, + { + "epoch": 0.7873814888822754, + "grad_norm": 45.39123351898769, + "learning_rate": 4.7463691498310475e-06, + "loss": 0.4757, + "step": 2315 + }, + { + "epoch": 0.7890820968496237, + "grad_norm": 17.03989550437034, + "learning_rate": 4.745177509836646e-06, + "loss": 0.4598, + "step": 2320 + }, + { + "epoch": 0.7907827048169721, + "grad_norm": 5.307561123525143, + "learning_rate": 4.743983227352817e-06, + "loss": 0.4667, + "step": 2325 + }, + { + "epoch": 0.7924833127843204, + "grad_norm": 7.788495450749629, + "learning_rate": 4.742786303785185e-06, + "loss": 0.4743, + "step": 2330 + }, + { + "epoch": 0.7941839207516688, + "grad_norm": 14.053571890786356, + "learning_rate": 4.741586740542485e-06, + "loss": 0.4795, + "step": 2335 + }, + { + "epoch": 0.795884528719017, + "grad_norm": 7.322934279287959, + "learning_rate": 4.740384539036559e-06, + "loss": 0.4729, + "step": 2340 + }, + { + "epoch": 0.7975851366863653, + "grad_norm": 37.789877963267486, + "learning_rate": 4.739179700682349e-06, + "loss": 0.4697, + "step": 2345 + }, + { + "epoch": 0.7992857446537137, + "grad_norm": 11.595352099933702, + "learning_rate": 4.737972226897909e-06, + "loss": 0.4745, + "step": 2350 + }, + { + "epoch": 0.800986352621062, + "grad_norm": 25.6667298770931, + "learning_rate": 4.736762119104386e-06, + "loss": 0.4823, + "step": 2355 + }, + { + "epoch": 0.8026869605884104, + "grad_norm": 4.220556063695543, + "learning_rate": 4.735549378726035e-06, + "loss": 0.4595, + "step": 2360 + }, + { + "epoch": 0.8043875685557587, + "grad_norm": 6.57385849376313, + "learning_rate": 4.734334007190204e-06, + "loss": 0.4611, + "step": 2365 + }, + { + "epoch": 0.806088176523107, + "grad_norm": 4.593605374546327, + "learning_rate": 4.7331160059273384e-06, + "loss": 0.486, + "step": 2370 + }, + { + "epoch": 0.8077887844904553, + "grad_norm": 6.230998624428787, + "learning_rate": 4.7318953763709815e-06, + "loss": 0.4626, + "step": 2375 + }, + { + "epoch": 0.8094893924578037, + "grad_norm": 4.308841381825005, + "learning_rate": 4.730672119957769e-06, + "loss": 0.4836, + "step": 2380 + }, + { + "epoch": 0.811190000425152, + "grad_norm": 10.504270551953045, + "learning_rate": 4.729446238127426e-06, + "loss": 0.4838, + "step": 2385 + }, + { + "epoch": 0.8128906083925003, + "grad_norm": 8.747795183006614, + "learning_rate": 4.72821773232277e-06, + "loss": 0.4655, + "step": 2390 + }, + { + "epoch": 0.8145912163598487, + "grad_norm": 4.696438683929611, + "learning_rate": 4.726986603989706e-06, + "loss": 0.4554, + "step": 2395 + }, + { + "epoch": 0.8162918243271969, + "grad_norm": 5.197297769902515, + "learning_rate": 4.725752854577226e-06, + "loss": 0.4854, + "step": 2400 + }, + { + "epoch": 0.8179924322945453, + "grad_norm": 9.708673608244993, + "learning_rate": 4.724516485537406e-06, + "loss": 0.4588, + "step": 2405 + }, + { + "epoch": 0.8196930402618936, + "grad_norm": 6.237931890661816, + "learning_rate": 4.723277498325406e-06, + "loss": 0.4478, + "step": 2410 + }, + { + "epoch": 0.821393648229242, + "grad_norm": 5.926988930736583, + "learning_rate": 4.722035894399467e-06, + "loss": 0.4962, + "step": 2415 + }, + { + "epoch": 0.8230942561965903, + "grad_norm": 3.4452720408933137, + "learning_rate": 4.7207916752209114e-06, + "loss": 0.4626, + "step": 2420 + }, + { + "epoch": 0.8247948641639387, + "grad_norm": 5.756019877459417, + "learning_rate": 4.719544842254138e-06, + "loss": 0.4521, + "step": 2425 + }, + { + "epoch": 0.8264954721312869, + "grad_norm": 10.606430075297766, + "learning_rate": 4.7182953969666205e-06, + "loss": 0.4817, + "step": 2430 + }, + { + "epoch": 0.8281960800986352, + "grad_norm": 6.154783522566068, + "learning_rate": 4.7170433408289115e-06, + "loss": 0.4515, + "step": 2435 + }, + { + "epoch": 0.8298966880659836, + "grad_norm": 7.466536394495599, + "learning_rate": 4.715788675314632e-06, + "loss": 0.4731, + "step": 2440 + }, + { + "epoch": 0.8315972960333319, + "grad_norm": 4.166417892434687, + "learning_rate": 4.714531401900477e-06, + "loss": 0.4818, + "step": 2445 + }, + { + "epoch": 0.8332979040006803, + "grad_norm": 9.757418884123936, + "learning_rate": 4.713271522066209e-06, + "loss": 0.4447, + "step": 2450 + }, + { + "epoch": 0.8349985119680285, + "grad_norm": 5.439076498016159, + "learning_rate": 4.712009037294661e-06, + "loss": 0.4596, + "step": 2455 + }, + { + "epoch": 0.8366991199353769, + "grad_norm": 17.66563572208141, + "learning_rate": 4.710743949071729e-06, + "loss": 0.4622, + "step": 2460 + }, + { + "epoch": 0.8383997279027252, + "grad_norm": 7.339347866198813, + "learning_rate": 4.709476258886374e-06, + "loss": 0.4777, + "step": 2465 + }, + { + "epoch": 0.8401003358700736, + "grad_norm": 5.808232850206273, + "learning_rate": 4.7082059682306205e-06, + "loss": 0.4658, + "step": 2470 + }, + { + "epoch": 0.8418009438374219, + "grad_norm": 4.758995702808101, + "learning_rate": 4.706933078599552e-06, + "loss": 0.4586, + "step": 2475 + }, + { + "epoch": 0.8435015518047702, + "grad_norm": 5.963329705407439, + "learning_rate": 4.70565759149131e-06, + "loss": 0.457, + "step": 2480 + }, + { + "epoch": 0.8452021597721185, + "grad_norm": 3.032448624315757, + "learning_rate": 4.7043795084070984e-06, + "loss": 0.4813, + "step": 2485 + }, + { + "epoch": 0.8469027677394668, + "grad_norm": 5.6970102027520415, + "learning_rate": 4.703098830851172e-06, + "loss": 0.4655, + "step": 2490 + }, + { + "epoch": 0.8486033757068152, + "grad_norm": 4.745699201763051, + "learning_rate": 4.701815560330838e-06, + "loss": 0.5015, + "step": 2495 + }, + { + "epoch": 0.8503039836741635, + "grad_norm": 4.211802120114542, + "learning_rate": 4.700529698356459e-06, + "loss": 0.496, + "step": 2500 + }, + { + "epoch": 0.8520045916415119, + "grad_norm": 4.025902675598321, + "learning_rate": 4.699241246441445e-06, + "loss": 0.4853, + "step": 2505 + }, + { + "epoch": 0.8537051996088602, + "grad_norm": 3.816704219546769, + "learning_rate": 4.697950206102258e-06, + "loss": 0.4383, + "step": 2510 + }, + { + "epoch": 0.8554058075762085, + "grad_norm": 9.123638826957368, + "learning_rate": 4.6966565788584e-06, + "loss": 0.4801, + "step": 2515 + }, + { + "epoch": 0.8571064155435568, + "grad_norm": 3.5778897585484786, + "learning_rate": 4.695360366232425e-06, + "loss": 0.4646, + "step": 2520 + }, + { + "epoch": 0.8588070235109051, + "grad_norm": 16.129442045616248, + "learning_rate": 4.694061569749926e-06, + "loss": 0.4688, + "step": 2525 + }, + { + "epoch": 0.8605076314782535, + "grad_norm": 6.115096395312036, + "learning_rate": 4.692760190939536e-06, + "loss": 0.4795, + "step": 2530 + }, + { + "epoch": 0.8622082394456018, + "grad_norm": 3.880119186396497, + "learning_rate": 4.69145623133293e-06, + "loss": 0.4716, + "step": 2535 + }, + { + "epoch": 0.8639088474129502, + "grad_norm": 4.367106638465115, + "learning_rate": 4.690149692464819e-06, + "loss": 0.4711, + "step": 2540 + }, + { + "epoch": 0.8656094553802984, + "grad_norm": 6.151471448173627, + "learning_rate": 4.688840575872949e-06, + "loss": 0.4756, + "step": 2545 + }, + { + "epoch": 0.8673100633476468, + "grad_norm": 4.2213353474054625, + "learning_rate": 4.687528883098104e-06, + "loss": 0.4862, + "step": 2550 + }, + { + "epoch": 0.8690106713149951, + "grad_norm": 3.7338203683525846, + "learning_rate": 4.686214615684095e-06, + "loss": 0.4693, + "step": 2555 + }, + { + "epoch": 0.8707112792823435, + "grad_norm": 3.793322509112781, + "learning_rate": 4.684897775177765e-06, + "loss": 0.4807, + "step": 2560 + }, + { + "epoch": 0.8724118872496918, + "grad_norm": 3.3071137506742536, + "learning_rate": 4.683578363128985e-06, + "loss": 0.4685, + "step": 2565 + }, + { + "epoch": 0.87411249521704, + "grad_norm": 3.70214370047083, + "learning_rate": 4.6822563810906555e-06, + "loss": 0.4666, + "step": 2570 + }, + { + "epoch": 0.8758131031843884, + "grad_norm": 12.650059452463687, + "learning_rate": 4.680931830618698e-06, + "loss": 0.4861, + "step": 2575 + }, + { + "epoch": 0.8775137111517367, + "grad_norm": 5.255235753352614, + "learning_rate": 4.67960471327206e-06, + "loss": 0.4642, + "step": 2580 + }, + { + "epoch": 0.8792143191190851, + "grad_norm": 3.427445126764981, + "learning_rate": 4.678275030612708e-06, + "loss": 0.4574, + "step": 2585 + }, + { + "epoch": 0.8809149270864334, + "grad_norm": 3.619414149116754, + "learning_rate": 4.676942784205627e-06, + "loss": 0.4566, + "step": 2590 + }, + { + "epoch": 0.8826155350537818, + "grad_norm": 4.467195890103454, + "learning_rate": 4.675607975618823e-06, + "loss": 0.4398, + "step": 2595 + }, + { + "epoch": 0.88431614302113, + "grad_norm": 7.306921975067082, + "learning_rate": 4.674270606423315e-06, + "loss": 0.4616, + "step": 2600 + }, + { + "epoch": 0.8860167509884784, + "grad_norm": 6.891857969882333, + "learning_rate": 4.672930678193135e-06, + "loss": 0.4634, + "step": 2605 + }, + { + "epoch": 0.8877173589558267, + "grad_norm": 9.568307356460663, + "learning_rate": 4.671588192505329e-06, + "loss": 0.4473, + "step": 2610 + }, + { + "epoch": 0.889417966923175, + "grad_norm": 6.489021091237962, + "learning_rate": 4.670243150939951e-06, + "loss": 0.4602, + "step": 2615 + }, + { + "epoch": 0.8911185748905234, + "grad_norm": 4.418966555081972, + "learning_rate": 4.668895555080067e-06, + "loss": 0.467, + "step": 2620 + }, + { + "epoch": 0.8928191828578717, + "grad_norm": 8.801700053716434, + "learning_rate": 4.667545406511745e-06, + "loss": 0.4814, + "step": 2625 + }, + { + "epoch": 0.89451979082522, + "grad_norm": 6.603121083107336, + "learning_rate": 4.666192706824058e-06, + "loss": 0.4666, + "step": 2630 + }, + { + "epoch": 0.8962203987925683, + "grad_norm": 17.83521499702972, + "learning_rate": 4.664837457609084e-06, + "loss": 0.4595, + "step": 2635 + }, + { + "epoch": 0.8979210067599167, + "grad_norm": 10.753629924265912, + "learning_rate": 4.6634796604619e-06, + "loss": 0.4613, + "step": 2640 + }, + { + "epoch": 0.899621614727265, + "grad_norm": 4.53607565282488, + "learning_rate": 4.662119316980581e-06, + "loss": 0.4613, + "step": 2645 + }, + { + "epoch": 0.9013222226946134, + "grad_norm": 28.28754483294441, + "learning_rate": 4.6607564287662025e-06, + "loss": 0.4687, + "step": 2650 + }, + { + "epoch": 0.9030228306619617, + "grad_norm": 5.844627428539397, + "learning_rate": 4.6593909974228305e-06, + "loss": 0.4811, + "step": 2655 + }, + { + "epoch": 0.9047234386293099, + "grad_norm": 16.725038565305354, + "learning_rate": 4.658023024557528e-06, + "loss": 0.4608, + "step": 2660 + }, + { + "epoch": 0.9064240465966583, + "grad_norm": 5.72088266552718, + "learning_rate": 4.656652511780346e-06, + "loss": 0.4965, + "step": 2665 + }, + { + "epoch": 0.9081246545640066, + "grad_norm": 8.49705017317251, + "learning_rate": 4.655279460704327e-06, + "loss": 0.5135, + "step": 2670 + }, + { + "epoch": 0.909825262531355, + "grad_norm": 13.24214617793498, + "learning_rate": 4.653903872945501e-06, + "loss": 0.4133, + "step": 2675 + }, + { + "epoch": 0.9115258704987033, + "grad_norm": 16.276010232458102, + "learning_rate": 4.652525750122881e-06, + "loss": 0.4976, + "step": 2680 + }, + { + "epoch": 0.9132264784660516, + "grad_norm": 6.662302437602245, + "learning_rate": 4.651145093858469e-06, + "loss": 0.472, + "step": 2685 + }, + { + "epoch": 0.9149270864333999, + "grad_norm": 6.4514546340510055, + "learning_rate": 4.6497619057772435e-06, + "loss": 0.4697, + "step": 2690 + }, + { + "epoch": 0.9166276944007483, + "grad_norm": 5.021678339377137, + "learning_rate": 4.648376187507165e-06, + "loss": 0.4664, + "step": 2695 + }, + { + "epoch": 0.9183283023680966, + "grad_norm": 6.669097101168566, + "learning_rate": 4.646987940679171e-06, + "loss": 0.4601, + "step": 2700 + }, + { + "epoch": 0.9200289103354449, + "grad_norm": 5.902255296320931, + "learning_rate": 4.645597166927177e-06, + "loss": 0.432, + "step": 2705 + }, + { + "epoch": 0.9217295183027933, + "grad_norm": 3.8720393482781272, + "learning_rate": 4.644203867888071e-06, + "loss": 0.4432, + "step": 2710 + }, + { + "epoch": 0.9234301262701415, + "grad_norm": 4.313561539291421, + "learning_rate": 4.642808045201713e-06, + "loss": 0.4798, + "step": 2715 + }, + { + "epoch": 0.9251307342374899, + "grad_norm": 3.977826142428004, + "learning_rate": 4.641409700510935e-06, + "loss": 0.4506, + "step": 2720 + }, + { + "epoch": 0.9268313422048382, + "grad_norm": 4.959302496448603, + "learning_rate": 4.640008835461535e-06, + "loss": 0.4767, + "step": 2725 + }, + { + "epoch": 0.9285319501721866, + "grad_norm": 5.247335980334496, + "learning_rate": 4.638605451702279e-06, + "loss": 0.4629, + "step": 2730 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 4.279180890288213, + "learning_rate": 4.637199550884896e-06, + "loss": 0.4616, + "step": 2735 + }, + { + "epoch": 0.9319331661068833, + "grad_norm": 6.556960210684756, + "learning_rate": 4.635791134664079e-06, + "loss": 0.4738, + "step": 2740 + }, + { + "epoch": 0.9336337740742315, + "grad_norm": 4.8762624843050135, + "learning_rate": 4.634380204697481e-06, + "loss": 0.4566, + "step": 2745 + }, + { + "epoch": 0.9353343820415798, + "grad_norm": 9.973695201929845, + "learning_rate": 4.632966762645713e-06, + "loss": 0.4344, + "step": 2750 + }, + { + "epoch": 0.9370349900089282, + "grad_norm": 7.938627728975852, + "learning_rate": 4.631550810172344e-06, + "loss": 0.4724, + "step": 2755 + }, + { + "epoch": 0.9387355979762765, + "grad_norm": 4.1641712047801756, + "learning_rate": 4.630132348943895e-06, + "loss": 0.4425, + "step": 2760 + }, + { + "epoch": 0.9404362059436249, + "grad_norm": 4.969972919685798, + "learning_rate": 4.628711380629843e-06, + "loss": 0.466, + "step": 2765 + }, + { + "epoch": 0.9421368139109731, + "grad_norm": 4.011899311812286, + "learning_rate": 4.627287906902615e-06, + "loss": 0.4923, + "step": 2770 + }, + { + "epoch": 0.9438374218783215, + "grad_norm": 11.1196782659473, + "learning_rate": 4.625861929437584e-06, + "loss": 0.4652, + "step": 2775 + }, + { + "epoch": 0.9455380298456698, + "grad_norm": 3.6956879692855003, + "learning_rate": 4.6244334499130725e-06, + "loss": 0.4909, + "step": 2780 + }, + { + "epoch": 0.9472386378130182, + "grad_norm": 19.071716213434737, + "learning_rate": 4.6230024700103485e-06, + "loss": 0.4454, + "step": 2785 + }, + { + "epoch": 0.9489392457803665, + "grad_norm": 7.606087801760514, + "learning_rate": 4.621568991413619e-06, + "loss": 0.4779, + "step": 2790 + }, + { + "epoch": 0.9506398537477148, + "grad_norm": 14.696215182161811, + "learning_rate": 4.6201330158100354e-06, + "loss": 0.4957, + "step": 2795 + }, + { + "epoch": 0.9523404617150631, + "grad_norm": 3.8448687891191873, + "learning_rate": 4.618694544889688e-06, + "loss": 0.4629, + "step": 2800 + }, + { + "epoch": 0.9540410696824114, + "grad_norm": 4.724455462118728, + "learning_rate": 4.617253580345602e-06, + "loss": 0.4518, + "step": 2805 + }, + { + "epoch": 0.9557416776497598, + "grad_norm": 5.1164099572684325, + "learning_rate": 4.6158101238737385e-06, + "loss": 0.4759, + "step": 2810 + }, + { + "epoch": 0.9574422856171081, + "grad_norm": 3.324153731402897, + "learning_rate": 4.6143641771729914e-06, + "loss": 0.445, + "step": 2815 + }, + { + "epoch": 0.9591428935844565, + "grad_norm": 6.728755458552797, + "learning_rate": 4.612915741945185e-06, + "loss": 0.4359, + "step": 2820 + }, + { + "epoch": 0.9608435015518048, + "grad_norm": 5.2302562387140625, + "learning_rate": 4.611464819895075e-06, + "loss": 0.4576, + "step": 2825 + }, + { + "epoch": 0.9625441095191531, + "grad_norm": 9.353759033223223, + "learning_rate": 4.61001141273034e-06, + "loss": 0.4831, + "step": 2830 + }, + { + "epoch": 0.9642447174865014, + "grad_norm": 5.5242719103803495, + "learning_rate": 4.608555522161586e-06, + "loss": 0.503, + "step": 2835 + }, + { + "epoch": 0.9659453254538497, + "grad_norm": 7.7545751238763865, + "learning_rate": 4.607097149902342e-06, + "loss": 0.4575, + "step": 2840 + }, + { + "epoch": 0.9676459334211981, + "grad_norm": 5.8608037887213795, + "learning_rate": 4.605636297669057e-06, + "loss": 0.4603, + "step": 2845 + }, + { + "epoch": 0.9693465413885464, + "grad_norm": 7.1451302730190065, + "learning_rate": 4.6041729671811e-06, + "loss": 0.4429, + "step": 2850 + }, + { + "epoch": 0.9710471493558948, + "grad_norm": 6.7365333394401645, + "learning_rate": 4.602707160160753e-06, + "loss": 0.4698, + "step": 2855 + }, + { + "epoch": 0.972747757323243, + "grad_norm": 3.6076879083981384, + "learning_rate": 4.601238878333218e-06, + "loss": 0.4658, + "step": 2860 + }, + { + "epoch": 0.9744483652905914, + "grad_norm": 9.201931404991615, + "learning_rate": 4.599768123426608e-06, + "loss": 0.4593, + "step": 2865 + }, + { + "epoch": 0.9761489732579397, + "grad_norm": 2.7281621411639883, + "learning_rate": 4.598294897171945e-06, + "loss": 0.4848, + "step": 2870 + }, + { + "epoch": 0.9778495812252881, + "grad_norm": 9.274983810881102, + "learning_rate": 4.596819201303161e-06, + "loss": 0.4579, + "step": 2875 + }, + { + "epoch": 0.9795501891926364, + "grad_norm": 3.935092331123125, + "learning_rate": 4.595341037557095e-06, + "loss": 0.4814, + "step": 2880 + }, + { + "epoch": 0.9812507971599846, + "grad_norm": 5.058622072366201, + "learning_rate": 4.59386040767349e-06, + "loss": 0.4646, + "step": 2885 + }, + { + "epoch": 0.982951405127333, + "grad_norm": 4.204448443843568, + "learning_rate": 4.59237731339499e-06, + "loss": 0.4688, + "step": 2890 + }, + { + "epoch": 0.9846520130946813, + "grad_norm": 3.554591216834179, + "learning_rate": 4.590891756467143e-06, + "loss": 0.4388, + "step": 2895 + }, + { + "epoch": 0.9863526210620297, + "grad_norm": 32.519673573312126, + "learning_rate": 4.589403738638393e-06, + "loss": 0.454, + "step": 2900 + }, + { + "epoch": 0.988053229029378, + "grad_norm": 3.968138282150363, + "learning_rate": 4.587913261660081e-06, + "loss": 0.4524, + "step": 2905 + }, + { + "epoch": 0.9897538369967264, + "grad_norm": 4.53973252164776, + "learning_rate": 4.586420327286442e-06, + "loss": 0.4525, + "step": 2910 + }, + { + "epoch": 0.9914544449640746, + "grad_norm": 6.107299256583201, + "learning_rate": 4.584924937274606e-06, + "loss": 0.4386, + "step": 2915 + }, + { + "epoch": 0.993155052931423, + "grad_norm": 3.4189689701799253, + "learning_rate": 4.583427093384587e-06, + "loss": 0.4676, + "step": 2920 + }, + { + "epoch": 0.9948556608987713, + "grad_norm": 4.958127080161495, + "learning_rate": 4.581926797379293e-06, + "loss": 0.4401, + "step": 2925 + }, + { + "epoch": 0.9965562688661196, + "grad_norm": 6.7040103587950775, + "learning_rate": 4.580424051024514e-06, + "loss": 0.449, + "step": 2930 + }, + { + "epoch": 0.998256876833468, + "grad_norm": 4.556130318387315, + "learning_rate": 4.57891885608893e-06, + "loss": 0.445, + "step": 2935 + }, + { + "epoch": 0.9999574848008163, + "grad_norm": 6.870877500958566, + "learning_rate": 4.577411214344095e-06, + "loss": 0.4826, + "step": 2940 + }, + { + "epoch": 1.0013604863738788, + "grad_norm": 4.65609078495718, + "learning_rate": 4.5759011275644476e-06, + "loss": 0.3659, + "step": 2945 + }, + { + "epoch": 1.003061094341227, + "grad_norm": 6.095258824323697, + "learning_rate": 4.574388597527303e-06, + "loss": 0.4694, + "step": 2950 + }, + { + "epoch": 1.0047617023085753, + "grad_norm": 13.23422585656265, + "learning_rate": 4.5728736260128534e-06, + "loss": 0.4472, + "step": 2955 + }, + { + "epoch": 1.0064623102759236, + "grad_norm": 12.631205425261738, + "learning_rate": 4.571356214804162e-06, + "loss": 0.4765, + "step": 2960 + }, + { + "epoch": 1.0081629182432719, + "grad_norm": 5.619759816789756, + "learning_rate": 4.569836365687164e-06, + "loss": 0.4474, + "step": 2965 + }, + { + "epoch": 1.0098635262106204, + "grad_norm": 7.497889977874089, + "learning_rate": 4.568314080450667e-06, + "loss": 0.4511, + "step": 2970 + }, + { + "epoch": 1.0115641341779686, + "grad_norm": 10.195727142868547, + "learning_rate": 4.566789360886341e-06, + "loss": 0.4309, + "step": 2975 + }, + { + "epoch": 1.013264742145317, + "grad_norm": 7.06258951989465, + "learning_rate": 4.565262208788725e-06, + "loss": 0.4776, + "step": 2980 + }, + { + "epoch": 1.0149653501126652, + "grad_norm": 4.829993597545264, + "learning_rate": 4.5637326259552195e-06, + "loss": 0.4804, + "step": 2985 + }, + { + "epoch": 1.0166659580800137, + "grad_norm": 3.9442902889511386, + "learning_rate": 4.562200614186085e-06, + "loss": 0.4682, + "step": 2990 + }, + { + "epoch": 1.018366566047362, + "grad_norm": 16.10502076363894, + "learning_rate": 4.560666175284441e-06, + "loss": 0.4772, + "step": 2995 + }, + { + "epoch": 1.0200671740147103, + "grad_norm": 6.765284857357975, + "learning_rate": 4.559129311056268e-06, + "loss": 0.4535, + "step": 3000 + }, + { + "epoch": 1.0217677819820585, + "grad_norm": 11.530197956087518, + "learning_rate": 4.557590023310393e-06, + "loss": 0.4638, + "step": 3005 + }, + { + "epoch": 1.0234683899494068, + "grad_norm": 3.2137510867237205, + "learning_rate": 4.556048313858503e-06, + "loss": 0.4323, + "step": 3010 + }, + { + "epoch": 1.0251689979167553, + "grad_norm": 5.4221029924761535, + "learning_rate": 4.554504184515129e-06, + "loss": 0.4371, + "step": 3015 + }, + { + "epoch": 1.0268696058841036, + "grad_norm": 4.507971574496705, + "learning_rate": 4.552957637097657e-06, + "loss": 0.4416, + "step": 3020 + }, + { + "epoch": 1.0285702138514519, + "grad_norm": 9.912936642252575, + "learning_rate": 4.551408673426311e-06, + "loss": 0.4822, + "step": 3025 + }, + { + "epoch": 1.0302708218188001, + "grad_norm": 5.346358979650128, + "learning_rate": 4.5498572953241655e-06, + "loss": 0.4549, + "step": 3030 + }, + { + "epoch": 1.0319714297861486, + "grad_norm": 6.011299429761261, + "learning_rate": 4.548303504617133e-06, + "loss": 0.4475, + "step": 3035 + }, + { + "epoch": 1.033672037753497, + "grad_norm": 3.2839622653873755, + "learning_rate": 4.546747303133968e-06, + "loss": 0.4557, + "step": 3040 + }, + { + "epoch": 1.0353726457208452, + "grad_norm": 23.614972369165166, + "learning_rate": 4.54518869270626e-06, + "loss": 0.4278, + "step": 3045 + }, + { + "epoch": 1.0370732536881935, + "grad_norm": 4.660578836842771, + "learning_rate": 4.543627675168434e-06, + "loss": 0.4481, + "step": 3050 + }, + { + "epoch": 1.0387738616555418, + "grad_norm": 10.575905769665871, + "learning_rate": 4.542064252357751e-06, + "loss": 0.4574, + "step": 3055 + }, + { + "epoch": 1.0404744696228903, + "grad_norm": 7.379932610615159, + "learning_rate": 4.540498426114299e-06, + "loss": 0.4684, + "step": 3060 + }, + { + "epoch": 1.0421750775902385, + "grad_norm": 8.807344023575178, + "learning_rate": 4.538930198280998e-06, + "loss": 0.4404, + "step": 3065 + }, + { + "epoch": 1.0438756855575868, + "grad_norm": 4.329853268424765, + "learning_rate": 4.537359570703591e-06, + "loss": 0.4524, + "step": 3070 + }, + { + "epoch": 1.045576293524935, + "grad_norm": 3.074946328347183, + "learning_rate": 4.53578654523065e-06, + "loss": 0.445, + "step": 3075 + }, + { + "epoch": 1.0472769014922836, + "grad_norm": 5.265770851396301, + "learning_rate": 4.5342111237135655e-06, + "loss": 0.4519, + "step": 3080 + }, + { + "epoch": 1.0489775094596319, + "grad_norm": 10.079118708913205, + "learning_rate": 4.53263330800655e-06, + "loss": 0.4468, + "step": 3085 + }, + { + "epoch": 1.0506781174269801, + "grad_norm": 4.1319553496504176, + "learning_rate": 4.531053099966632e-06, + "loss": 0.4688, + "step": 3090 + }, + { + "epoch": 1.0523787253943284, + "grad_norm": 3.4649959921528852, + "learning_rate": 4.529470501453659e-06, + "loss": 0.4332, + "step": 3095 + }, + { + "epoch": 1.0540793333616767, + "grad_norm": 6.613087258668495, + "learning_rate": 4.527885514330287e-06, + "loss": 0.4308, + "step": 3100 + }, + { + "epoch": 1.0557799413290252, + "grad_norm": 3.239224295796761, + "learning_rate": 4.5262981404619885e-06, + "loss": 0.4474, + "step": 3105 + }, + { + "epoch": 1.0574805492963735, + "grad_norm": 6.602689444846102, + "learning_rate": 4.524708381717042e-06, + "loss": 0.4446, + "step": 3110 + }, + { + "epoch": 1.0591811572637218, + "grad_norm": 6.351423156609205, + "learning_rate": 4.523116239966533e-06, + "loss": 0.4614, + "step": 3115 + }, + { + "epoch": 1.06088176523107, + "grad_norm": 5.347267558037939, + "learning_rate": 4.521521717084354e-06, + "loss": 0.4599, + "step": 3120 + }, + { + "epoch": 1.0625823731984185, + "grad_norm": 6.26149946924594, + "learning_rate": 4.519924814947197e-06, + "loss": 0.4677, + "step": 3125 + }, + { + "epoch": 1.0642829811657668, + "grad_norm": 5.167871496778332, + "learning_rate": 4.518325535434557e-06, + "loss": 0.4429, + "step": 3130 + }, + { + "epoch": 1.065983589133115, + "grad_norm": 3.674103729786612, + "learning_rate": 4.516723880428725e-06, + "loss": 0.4338, + "step": 3135 + }, + { + "epoch": 1.0676841971004634, + "grad_norm": 7.916041729824052, + "learning_rate": 4.515119851814788e-06, + "loss": 0.4385, + "step": 3140 + }, + { + "epoch": 1.0693848050678116, + "grad_norm": 6.309378831258314, + "learning_rate": 4.513513451480629e-06, + "loss": 0.4474, + "step": 3145 + }, + { + "epoch": 1.0710854130351601, + "grad_norm": 3.51815928246974, + "learning_rate": 4.511904681316919e-06, + "loss": 0.4418, + "step": 3150 + }, + { + "epoch": 1.0727860210025084, + "grad_norm": 7.411610274427342, + "learning_rate": 4.5102935432171215e-06, + "loss": 0.464, + "step": 3155 + }, + { + "epoch": 1.0744866289698567, + "grad_norm": 28.98091241308416, + "learning_rate": 4.508680039077484e-06, + "loss": 0.4333, + "step": 3160 + }, + { + "epoch": 1.076187236937205, + "grad_norm": 4.84016751436113, + "learning_rate": 4.507064170797041e-06, + "loss": 0.4527, + "step": 3165 + }, + { + "epoch": 1.0778878449045535, + "grad_norm": 28.89849564174859, + "learning_rate": 4.505445940277608e-06, + "loss": 0.4709, + "step": 3170 + }, + { + "epoch": 1.0795884528719017, + "grad_norm": 6.460096323509922, + "learning_rate": 4.503825349423782e-06, + "loss": 0.4478, + "step": 3175 + }, + { + "epoch": 1.08128906083925, + "grad_norm": 4.593375607263882, + "learning_rate": 4.502202400142938e-06, + "loss": 0.4602, + "step": 3180 + }, + { + "epoch": 1.0829896688065983, + "grad_norm": 6.4386210353397795, + "learning_rate": 4.500577094345224e-06, + "loss": 0.4451, + "step": 3185 + }, + { + "epoch": 1.0846902767739466, + "grad_norm": 6.617364744010471, + "learning_rate": 4.498949433943567e-06, + "loss": 0.469, + "step": 3190 + }, + { + "epoch": 1.086390884741295, + "grad_norm": 9.878249271965329, + "learning_rate": 4.497319420853658e-06, + "loss": 0.4468, + "step": 3195 + }, + { + "epoch": 1.0880914927086434, + "grad_norm": 13.343862517629125, + "learning_rate": 4.495687056993966e-06, + "loss": 0.4649, + "step": 3200 + }, + { + "epoch": 1.0897921006759916, + "grad_norm": 8.119912932277256, + "learning_rate": 4.4940523442857176e-06, + "loss": 0.4563, + "step": 3205 + }, + { + "epoch": 1.09149270864334, + "grad_norm": 5.451266829499762, + "learning_rate": 4.49241528465291e-06, + "loss": 0.438, + "step": 3210 + }, + { + "epoch": 1.0931933166106884, + "grad_norm": 5.946305828392075, + "learning_rate": 4.490775880022301e-06, + "loss": 0.4577, + "step": 3215 + }, + { + "epoch": 1.0948939245780367, + "grad_norm": 8.190063989790795, + "learning_rate": 4.489134132323407e-06, + "loss": 0.4244, + "step": 3220 + }, + { + "epoch": 1.096594532545385, + "grad_norm": 5.297926056975686, + "learning_rate": 4.487490043488504e-06, + "loss": 0.4599, + "step": 3225 + }, + { + "epoch": 1.0982951405127332, + "grad_norm": 4.262145728783015, + "learning_rate": 4.485843615452622e-06, + "loss": 0.4454, + "step": 3230 + }, + { + "epoch": 1.0999957484800815, + "grad_norm": 5.098276012820175, + "learning_rate": 4.484194850153546e-06, + "loss": 0.4561, + "step": 3235 + }, + { + "epoch": 1.10169635644743, + "grad_norm": 7.273781115260355, + "learning_rate": 4.4825437495318105e-06, + "loss": 0.468, + "step": 3240 + }, + { + "epoch": 1.1033969644147783, + "grad_norm": 4.933918921533191, + "learning_rate": 4.480890315530698e-06, + "loss": 0.453, + "step": 3245 + }, + { + "epoch": 1.1050975723821266, + "grad_norm": 17.626403443693594, + "learning_rate": 4.479234550096238e-06, + "loss": 0.4442, + "step": 3250 + }, + { + "epoch": 1.1067981803494749, + "grad_norm": 23.676732400103766, + "learning_rate": 4.477576455177205e-06, + "loss": 0.4455, + "step": 3255 + }, + { + "epoch": 1.1084987883168234, + "grad_norm": 6.900734191009607, + "learning_rate": 4.475916032725114e-06, + "loss": 0.4584, + "step": 3260 + }, + { + "epoch": 1.1101993962841716, + "grad_norm": 16.725403823482687, + "learning_rate": 4.474253284694219e-06, + "loss": 0.4481, + "step": 3265 + }, + { + "epoch": 1.11190000425152, + "grad_norm": 4.13809372200513, + "learning_rate": 4.472588213041514e-06, + "loss": 0.4184, + "step": 3270 + }, + { + "epoch": 1.1136006122188682, + "grad_norm": 11.916770024145645, + "learning_rate": 4.470920819726722e-06, + "loss": 0.4359, + "step": 3275 + }, + { + "epoch": 1.1153012201862165, + "grad_norm": 4.327376908798274, + "learning_rate": 4.469251106712306e-06, + "loss": 0.4727, + "step": 3280 + }, + { + "epoch": 1.117001828153565, + "grad_norm": 15.701918872574687, + "learning_rate": 4.467579075963452e-06, + "loss": 0.4573, + "step": 3285 + }, + { + "epoch": 1.1187024361209132, + "grad_norm": 3.1919875586325404, + "learning_rate": 4.46590472944808e-06, + "loss": 0.4645, + "step": 3290 + }, + { + "epoch": 1.1204030440882615, + "grad_norm": 3.694077218569007, + "learning_rate": 4.464228069136832e-06, + "loss": 0.4337, + "step": 3295 + }, + { + "epoch": 1.1221036520556098, + "grad_norm": 4.949030798295776, + "learning_rate": 4.462549097003074e-06, + "loss": 0.4358, + "step": 3300 + }, + { + "epoch": 1.1238042600229583, + "grad_norm": 9.681946220701647, + "learning_rate": 4.460867815022892e-06, + "loss": 0.4263, + "step": 3305 + }, + { + "epoch": 1.1255048679903066, + "grad_norm": 3.977714371887995, + "learning_rate": 4.459184225175093e-06, + "loss": 0.3886, + "step": 3310 + }, + { + "epoch": 1.1272054759576549, + "grad_norm": 11.168871492637267, + "learning_rate": 4.4574983294411986e-06, + "loss": 0.4295, + "step": 3315 + }, + { + "epoch": 1.1289060839250031, + "grad_norm": 6.249038479588919, + "learning_rate": 4.455810129805443e-06, + "loss": 0.4602, + "step": 3320 + }, + { + "epoch": 1.1306066918923516, + "grad_norm": 6.658957506492276, + "learning_rate": 4.454119628254776e-06, + "loss": 0.4495, + "step": 3325 + }, + { + "epoch": 1.1323072998597, + "grad_norm": 3.170050443727355, + "learning_rate": 4.452426826778854e-06, + "loss": 0.4616, + "step": 3330 + }, + { + "epoch": 1.1340079078270482, + "grad_norm": 4.034518863293827, + "learning_rate": 4.45073172737004e-06, + "loss": 0.4699, + "step": 3335 + }, + { + "epoch": 1.1357085157943965, + "grad_norm": 14.956232374656196, + "learning_rate": 4.449034332023401e-06, + "loss": 0.4378, + "step": 3340 + }, + { + "epoch": 1.1374091237617447, + "grad_norm": 5.544782939969213, + "learning_rate": 4.447334642736709e-06, + "loss": 0.4552, + "step": 3345 + }, + { + "epoch": 1.1391097317290932, + "grad_norm": 4.562157624305602, + "learning_rate": 4.445632661510434e-06, + "loss": 0.4546, + "step": 3350 + }, + { + "epoch": 1.1408103396964415, + "grad_norm": 4.225927268811578, + "learning_rate": 4.443928390347744e-06, + "loss": 0.4529, + "step": 3355 + }, + { + "epoch": 1.1425109476637898, + "grad_norm": 4.3257536228426074, + "learning_rate": 4.442221831254502e-06, + "loss": 0.4285, + "step": 3360 + }, + { + "epoch": 1.144211555631138, + "grad_norm": 5.977479998785959, + "learning_rate": 4.440512986239263e-06, + "loss": 0.449, + "step": 3365 + }, + { + "epoch": 1.1459121635984864, + "grad_norm": 4.4066822856795635, + "learning_rate": 4.438801857313274e-06, + "loss": 0.4453, + "step": 3370 + }, + { + "epoch": 1.1476127715658349, + "grad_norm": 3.491649964713033, + "learning_rate": 4.437088446490469e-06, + "loss": 0.4454, + "step": 3375 + }, + { + "epoch": 1.1493133795331831, + "grad_norm": 7.232323088989081, + "learning_rate": 4.435372755787469e-06, + "loss": 0.4534, + "step": 3380 + }, + { + "epoch": 1.1510139875005314, + "grad_norm": 3.293718858011363, + "learning_rate": 4.433654787223576e-06, + "loss": 0.4597, + "step": 3385 + }, + { + "epoch": 1.1527145954678797, + "grad_norm": 7.433792579695618, + "learning_rate": 4.431934542820775e-06, + "loss": 0.4573, + "step": 3390 + }, + { + "epoch": 1.1544152034352282, + "grad_norm": 3.82924370466945, + "learning_rate": 4.4302120246037295e-06, + "loss": 0.4567, + "step": 3395 + }, + { + "epoch": 1.1561158114025765, + "grad_norm": 7.983481827345274, + "learning_rate": 4.428487234599777e-06, + "loss": 0.4414, + "step": 3400 + }, + { + "epoch": 1.1578164193699247, + "grad_norm": 9.436609447791255, + "learning_rate": 4.426760174838932e-06, + "loss": 0.4262, + "step": 3405 + }, + { + "epoch": 1.159517027337273, + "grad_norm": 4.743398659803592, + "learning_rate": 4.425030847353878e-06, + "loss": 0.4155, + "step": 3410 + }, + { + "epoch": 1.1612176353046215, + "grad_norm": 3.204865750073265, + "learning_rate": 4.42329925417997e-06, + "loss": 0.4773, + "step": 3415 + }, + { + "epoch": 1.1629182432719698, + "grad_norm": 12.234446040030164, + "learning_rate": 4.421565397355225e-06, + "loss": 0.4617, + "step": 3420 + }, + { + "epoch": 1.164618851239318, + "grad_norm": 4.4116669121739305, + "learning_rate": 4.41982927892033e-06, + "loss": 0.4391, + "step": 3425 + }, + { + "epoch": 1.1663194592066664, + "grad_norm": 5.111216592507992, + "learning_rate": 4.418090900918629e-06, + "loss": 0.4631, + "step": 3430 + }, + { + "epoch": 1.1680200671740146, + "grad_norm": 7.803946682117352, + "learning_rate": 4.416350265396129e-06, + "loss": 0.4269, + "step": 3435 + }, + { + "epoch": 1.1697206751413631, + "grad_norm": 3.828818855349512, + "learning_rate": 4.414607374401492e-06, + "loss": 0.4256, + "step": 3440 + }, + { + "epoch": 1.1714212831087114, + "grad_norm": 5.059207154144129, + "learning_rate": 4.412862229986034e-06, + "loss": 0.4122, + "step": 3445 + }, + { + "epoch": 1.1731218910760597, + "grad_norm": 5.035231185244624, + "learning_rate": 4.411114834203726e-06, + "loss": 0.4311, + "step": 3450 + }, + { + "epoch": 1.174822499043408, + "grad_norm": 4.045539793414604, + "learning_rate": 4.409365189111187e-06, + "loss": 0.4287, + "step": 3455 + }, + { + "epoch": 1.1765231070107562, + "grad_norm": 6.292496913133239, + "learning_rate": 4.407613296767682e-06, + "loss": 0.4475, + "step": 3460 + }, + { + "epoch": 1.1782237149781047, + "grad_norm": 3.6674441134732283, + "learning_rate": 4.405859159235123e-06, + "loss": 0.4389, + "step": 3465 + }, + { + "epoch": 1.179924322945453, + "grad_norm": 5.160559006742301, + "learning_rate": 4.404102778578064e-06, + "loss": 0.4276, + "step": 3470 + }, + { + "epoch": 1.1816249309128013, + "grad_norm": 4.055580025128073, + "learning_rate": 4.402344156863699e-06, + "loss": 0.457, + "step": 3475 + }, + { + "epoch": 1.1833255388801496, + "grad_norm": 4.361898800931103, + "learning_rate": 4.40058329616186e-06, + "loss": 0.4286, + "step": 3480 + }, + { + "epoch": 1.185026146847498, + "grad_norm": 6.495140588757689, + "learning_rate": 4.398820198545013e-06, + "loss": 0.4453, + "step": 3485 + }, + { + "epoch": 1.1867267548148464, + "grad_norm": 11.019077172110329, + "learning_rate": 4.397054866088258e-06, + "loss": 0.442, + "step": 3490 + }, + { + "epoch": 1.1884273627821946, + "grad_norm": 5.228402398620584, + "learning_rate": 4.3952873008693245e-06, + "loss": 0.4321, + "step": 3495 + }, + { + "epoch": 1.190127970749543, + "grad_norm": 4.805648806263351, + "learning_rate": 4.39351750496857e-06, + "loss": 0.4366, + "step": 3500 + }, + { + "epoch": 1.1918285787168914, + "grad_norm": 7.796340025598192, + "learning_rate": 4.391745480468978e-06, + "loss": 0.461, + "step": 3505 + }, + { + "epoch": 1.1935291866842397, + "grad_norm": 3.063990773463929, + "learning_rate": 4.389971229456154e-06, + "loss": 0.4458, + "step": 3510 + }, + { + "epoch": 1.195229794651588, + "grad_norm": 4.429362476378763, + "learning_rate": 4.388194754018327e-06, + "loss": 0.4428, + "step": 3515 + }, + { + "epoch": 1.1969304026189362, + "grad_norm": 2.9371227567250004, + "learning_rate": 4.38641605624634e-06, + "loss": 0.4306, + "step": 3520 + }, + { + "epoch": 1.1986310105862845, + "grad_norm": 3.8651376005294944, + "learning_rate": 4.384635138233653e-06, + "loss": 0.4449, + "step": 3525 + }, + { + "epoch": 1.200331618553633, + "grad_norm": 3.4940040974535727, + "learning_rate": 4.38285200207634e-06, + "loss": 0.4482, + "step": 3530 + }, + { + "epoch": 1.2020322265209813, + "grad_norm": 5.782702541050796, + "learning_rate": 4.381066649873085e-06, + "loss": 0.4207, + "step": 3535 + }, + { + "epoch": 1.2037328344883296, + "grad_norm": 5.50472142631619, + "learning_rate": 4.37927908372518e-06, + "loss": 0.4572, + "step": 3540 + }, + { + "epoch": 1.2054334424556779, + "grad_norm": 7.824789252994781, + "learning_rate": 4.3774893057365244e-06, + "loss": 0.4318, + "step": 3545 + }, + { + "epoch": 1.2071340504230261, + "grad_norm": 13.54244874218656, + "learning_rate": 4.375697318013618e-06, + "loss": 0.4301, + "step": 3550 + }, + { + "epoch": 1.2088346583903746, + "grad_norm": 4.230530471384282, + "learning_rate": 4.373903122665563e-06, + "loss": 0.4292, + "step": 3555 + }, + { + "epoch": 1.210535266357723, + "grad_norm": 6.658320209793368, + "learning_rate": 4.372106721804061e-06, + "loss": 0.4368, + "step": 3560 + }, + { + "epoch": 1.2122358743250712, + "grad_norm": 2.857261861954253, + "learning_rate": 4.370308117543407e-06, + "loss": 0.4522, + "step": 3565 + }, + { + "epoch": 1.2139364822924195, + "grad_norm": 5.39200827416774, + "learning_rate": 4.368507312000491e-06, + "loss": 0.4387, + "step": 3570 + }, + { + "epoch": 1.215637090259768, + "grad_norm": 23.642260016581286, + "learning_rate": 4.366704307294794e-06, + "loss": 0.4245, + "step": 3575 + }, + { + "epoch": 1.2173376982271162, + "grad_norm": 4.715256234640616, + "learning_rate": 4.364899105548384e-06, + "loss": 0.4123, + "step": 3580 + }, + { + "epoch": 1.2190383061944645, + "grad_norm": 5.092755025756587, + "learning_rate": 4.363091708885916e-06, + "loss": 0.4303, + "step": 3585 + }, + { + "epoch": 1.2207389141618128, + "grad_norm": 4.644408008547105, + "learning_rate": 4.361282119434626e-06, + "loss": 0.4406, + "step": 3590 + }, + { + "epoch": 1.2224395221291613, + "grad_norm": 3.8748710347038995, + "learning_rate": 4.359470339324335e-06, + "loss": 0.4194, + "step": 3595 + }, + { + "epoch": 1.2241401300965096, + "grad_norm": 6.939396520641551, + "learning_rate": 4.35765637068744e-06, + "loss": 0.4308, + "step": 3600 + }, + { + "epoch": 1.2258407380638578, + "grad_norm": 5.883415452113619, + "learning_rate": 4.355840215658912e-06, + "loss": 0.4245, + "step": 3605 + }, + { + "epoch": 1.2275413460312061, + "grad_norm": 4.372515269841919, + "learning_rate": 4.354021876376297e-06, + "loss": 0.466, + "step": 3610 + }, + { + "epoch": 1.2292419539985544, + "grad_norm": 3.38508833964315, + "learning_rate": 4.352201354979715e-06, + "loss": 0.4437, + "step": 3615 + }, + { + "epoch": 1.230942561965903, + "grad_norm": 4.334673736588709, + "learning_rate": 4.350378653611848e-06, + "loss": 0.4338, + "step": 3620 + }, + { + "epoch": 1.2326431699332512, + "grad_norm": 4.362597499023024, + "learning_rate": 4.348553774417948e-06, + "loss": 0.4537, + "step": 3625 + }, + { + "epoch": 1.2343437779005995, + "grad_norm": 10.152475333998257, + "learning_rate": 4.346726719545828e-06, + "loss": 0.437, + "step": 3630 + }, + { + "epoch": 1.2360443858679477, + "grad_norm": 6.917711265677128, + "learning_rate": 4.344897491145866e-06, + "loss": 0.4567, + "step": 3635 + }, + { + "epoch": 1.237744993835296, + "grad_norm": 3.0024625558233393, + "learning_rate": 4.343066091370992e-06, + "loss": 0.4586, + "step": 3640 + }, + { + "epoch": 1.2394456018026445, + "grad_norm": 6.066921141014438, + "learning_rate": 4.341232522376696e-06, + "loss": 0.4439, + "step": 3645 + }, + { + "epoch": 1.2411462097699928, + "grad_norm": 10.159516057402293, + "learning_rate": 4.339396786321018e-06, + "loss": 0.4262, + "step": 3650 + }, + { + "epoch": 1.242846817737341, + "grad_norm": 6.1930370198982105, + "learning_rate": 4.337558885364552e-06, + "loss": 0.4147, + "step": 3655 + }, + { + "epoch": 1.2445474257046893, + "grad_norm": 4.609436343311945, + "learning_rate": 4.335718821670439e-06, + "loss": 0.4256, + "step": 3660 + }, + { + "epoch": 1.2462480336720378, + "grad_norm": 6.704588153460611, + "learning_rate": 4.333876597404362e-06, + "loss": 0.4298, + "step": 3665 + }, + { + "epoch": 1.2479486416393861, + "grad_norm": 6.537990452906174, + "learning_rate": 4.332032214734552e-06, + "loss": 0.4309, + "step": 3670 + }, + { + "epoch": 1.2496492496067344, + "grad_norm": 6.06218752888017, + "learning_rate": 4.3301856758317765e-06, + "loss": 0.4543, + "step": 3675 + }, + { + "epoch": 1.2513498575740827, + "grad_norm": 4.072305293977542, + "learning_rate": 4.328336982869343e-06, + "loss": 0.4389, + "step": 3680 + }, + { + "epoch": 1.2530504655414312, + "grad_norm": 5.420253035287267, + "learning_rate": 4.326486138023094e-06, + "loss": 0.4543, + "step": 3685 + }, + { + "epoch": 1.2547510735087795, + "grad_norm": 4.952172100936549, + "learning_rate": 4.324633143471402e-06, + "loss": 0.4219, + "step": 3690 + }, + { + "epoch": 1.2564516814761277, + "grad_norm": 4.895280001327636, + "learning_rate": 4.322778001395174e-06, + "loss": 0.4624, + "step": 3695 + }, + { + "epoch": 1.258152289443476, + "grad_norm": 5.021111606296189, + "learning_rate": 4.320920713977843e-06, + "loss": 0.4493, + "step": 3700 + }, + { + "epoch": 1.2598528974108243, + "grad_norm": 8.792884307722462, + "learning_rate": 4.319061283405365e-06, + "loss": 0.4507, + "step": 3705 + }, + { + "epoch": 1.2615535053781728, + "grad_norm": 8.462433497526717, + "learning_rate": 4.317199711866219e-06, + "loss": 0.4387, + "step": 3710 + }, + { + "epoch": 1.263254113345521, + "grad_norm": 11.089756877549231, + "learning_rate": 4.315336001551407e-06, + "loss": 0.4351, + "step": 3715 + }, + { + "epoch": 1.2649547213128693, + "grad_norm": 4.625412332527817, + "learning_rate": 4.313470154654443e-06, + "loss": 0.4348, + "step": 3720 + }, + { + "epoch": 1.2666553292802176, + "grad_norm": 5.432964358134367, + "learning_rate": 4.311602173371362e-06, + "loss": 0.4486, + "step": 3725 + }, + { + "epoch": 1.268355937247566, + "grad_norm": 4.206142665765904, + "learning_rate": 4.309732059900705e-06, + "loss": 0.448, + "step": 3730 + }, + { + "epoch": 1.2700565452149144, + "grad_norm": 6.159446545724075, + "learning_rate": 4.307859816443526e-06, + "loss": 0.4086, + "step": 3735 + }, + { + "epoch": 1.2717571531822627, + "grad_norm": 7.831797795407591, + "learning_rate": 4.305985445203385e-06, + "loss": 0.4538, + "step": 3740 + }, + { + "epoch": 1.273457761149611, + "grad_norm": 8.60007583844762, + "learning_rate": 4.304108948386346e-06, + "loss": 0.4534, + "step": 3745 + }, + { + "epoch": 1.2751583691169592, + "grad_norm": 16.34293430521431, + "learning_rate": 4.3022303282009755e-06, + "loss": 0.4493, + "step": 3750 + }, + { + "epoch": 1.2768589770843075, + "grad_norm": 5.278467269983003, + "learning_rate": 4.30034958685834e-06, + "loss": 0.4491, + "step": 3755 + }, + { + "epoch": 1.278559585051656, + "grad_norm": 9.360024586550447, + "learning_rate": 4.298466726571999e-06, + "loss": 0.4212, + "step": 3760 + }, + { + "epoch": 1.2802601930190043, + "grad_norm": 3.7113866114527947, + "learning_rate": 4.296581749558011e-06, + "loss": 0.4638, + "step": 3765 + }, + { + "epoch": 1.2819608009863526, + "grad_norm": 5.854958952627499, + "learning_rate": 4.29469465803492e-06, + "loss": 0.4462, + "step": 3770 + }, + { + "epoch": 1.283661408953701, + "grad_norm": 3.6818168663588784, + "learning_rate": 4.292805454223763e-06, + "loss": 0.4291, + "step": 3775 + }, + { + "epoch": 1.2853620169210493, + "grad_norm": 3.243734301495061, + "learning_rate": 4.290914140348063e-06, + "loss": 0.4256, + "step": 3780 + }, + { + "epoch": 1.2870626248883976, + "grad_norm": 3.2162443775372447, + "learning_rate": 4.289020718633822e-06, + "loss": 0.4217, + "step": 3785 + }, + { + "epoch": 1.288763232855746, + "grad_norm": 5.687066509718208, + "learning_rate": 4.28712519130953e-06, + "loss": 0.4271, + "step": 3790 + }, + { + "epoch": 1.2904638408230942, + "grad_norm": 24.368889431915512, + "learning_rate": 4.285227560606149e-06, + "loss": 0.4308, + "step": 3795 + }, + { + "epoch": 1.2921644487904427, + "grad_norm": 4.154763886002037, + "learning_rate": 4.2833278287571186e-06, + "loss": 0.4554, + "step": 3800 + }, + { + "epoch": 1.293865056757791, + "grad_norm": 6.2104169824121955, + "learning_rate": 4.281425997998353e-06, + "loss": 0.4408, + "step": 3805 + }, + { + "epoch": 1.2955656647251392, + "grad_norm": 4.695856290250523, + "learning_rate": 4.279522070568235e-06, + "loss": 0.457, + "step": 3810 + }, + { + "epoch": 1.2972662726924875, + "grad_norm": 5.810017736651081, + "learning_rate": 4.277616048707615e-06, + "loss": 0.4248, + "step": 3815 + }, + { + "epoch": 1.2989668806598358, + "grad_norm": 6.107344653532907, + "learning_rate": 4.2757079346598105e-06, + "loss": 0.4452, + "step": 3820 + }, + { + "epoch": 1.3006674886271843, + "grad_norm": 3.4942653230821863, + "learning_rate": 4.273797730670598e-06, + "loss": 0.4217, + "step": 3825 + }, + { + "epoch": 1.3023680965945326, + "grad_norm": 18.112883039304148, + "learning_rate": 4.271885438988217e-06, + "loss": 0.437, + "step": 3830 + }, + { + "epoch": 1.3040687045618808, + "grad_norm": 19.389834849422982, + "learning_rate": 4.269971061863362e-06, + "loss": 0.419, + "step": 3835 + }, + { + "epoch": 1.3057693125292291, + "grad_norm": 5.474955920511076, + "learning_rate": 4.268054601549183e-06, + "loss": 0.4595, + "step": 3840 + }, + { + "epoch": 1.3074699204965774, + "grad_norm": 2.9582870715276792, + "learning_rate": 4.2661360603012825e-06, + "loss": 0.4621, + "step": 3845 + }, + { + "epoch": 1.309170528463926, + "grad_norm": 16.34027412673593, + "learning_rate": 4.2642154403777105e-06, + "loss": 0.4233, + "step": 3850 + }, + { + "epoch": 1.3108711364312742, + "grad_norm": 5.918808175469473, + "learning_rate": 4.262292744038964e-06, + "loss": 0.4406, + "step": 3855 + }, + { + "epoch": 1.3125717443986225, + "grad_norm": 3.399498455379893, + "learning_rate": 4.260367973547985e-06, + "loss": 0.4299, + "step": 3860 + }, + { + "epoch": 1.314272352365971, + "grad_norm": 13.542704000607294, + "learning_rate": 4.258441131170157e-06, + "loss": 0.4132, + "step": 3865 + }, + { + "epoch": 1.3159729603333192, + "grad_norm": 9.02967740830736, + "learning_rate": 4.256512219173298e-06, + "loss": 0.4239, + "step": 3870 + }, + { + "epoch": 1.3176735683006675, + "grad_norm": 3.9130220430192524, + "learning_rate": 4.254581239827667e-06, + "loss": 0.4342, + "step": 3875 + }, + { + "epoch": 1.3193741762680158, + "grad_norm": 5.574299002291679, + "learning_rate": 4.252648195405954e-06, + "loss": 0.4568, + "step": 3880 + }, + { + "epoch": 1.321074784235364, + "grad_norm": 6.3324159546947465, + "learning_rate": 4.250713088183278e-06, + "loss": 0.4419, + "step": 3885 + }, + { + "epoch": 1.3227753922027126, + "grad_norm": 14.164208290467345, + "learning_rate": 4.248775920437191e-06, + "loss": 0.4401, + "step": 3890 + }, + { + "epoch": 1.3244760001700608, + "grad_norm": 6.1888271893560916, + "learning_rate": 4.246836694447661e-06, + "loss": 0.4475, + "step": 3895 + }, + { + "epoch": 1.3261766081374091, + "grad_norm": 2.957848596009968, + "learning_rate": 4.244895412497088e-06, + "loss": 0.4035, + "step": 3900 + }, + { + "epoch": 1.3278772161047574, + "grad_norm": 4.295157380784761, + "learning_rate": 4.242952076870287e-06, + "loss": 0.4524, + "step": 3905 + }, + { + "epoch": 1.3295778240721057, + "grad_norm": 9.30317062363427, + "learning_rate": 4.241006689854491e-06, + "loss": 0.4306, + "step": 3910 + }, + { + "epoch": 1.3312784320394542, + "grad_norm": 4.482380339897904, + "learning_rate": 4.239059253739346e-06, + "loss": 0.4121, + "step": 3915 + }, + { + "epoch": 1.3329790400068025, + "grad_norm": 4.424068078801065, + "learning_rate": 4.237109770816913e-06, + "loss": 0.4164, + "step": 3920 + }, + { + "epoch": 1.3346796479741507, + "grad_norm": 4.998850928883728, + "learning_rate": 4.235158243381658e-06, + "loss": 0.4392, + "step": 3925 + }, + { + "epoch": 1.336380255941499, + "grad_norm": 6.3018273454234635, + "learning_rate": 4.233204673730456e-06, + "loss": 0.4501, + "step": 3930 + }, + { + "epoch": 1.3380808639088473, + "grad_norm": 8.502288918753674, + "learning_rate": 4.231249064162586e-06, + "loss": 0.4359, + "step": 3935 + }, + { + "epoch": 1.3397814718761958, + "grad_norm": 4.793062000875688, + "learning_rate": 4.229291416979726e-06, + "loss": 0.455, + "step": 3940 + }, + { + "epoch": 1.341482079843544, + "grad_norm": 9.180274029644583, + "learning_rate": 4.227331734485953e-06, + "loss": 0.4115, + "step": 3945 + }, + { + "epoch": 1.3431826878108923, + "grad_norm": 3.119289594195432, + "learning_rate": 4.225370018987741e-06, + "loss": 0.4362, + "step": 3950 + }, + { + "epoch": 1.3448832957782408, + "grad_norm": 7.274300467490672, + "learning_rate": 4.223406272793953e-06, + "loss": 0.4284, + "step": 3955 + }, + { + "epoch": 1.3465839037455891, + "grad_norm": 4.6503861534660675, + "learning_rate": 4.221440498215845e-06, + "loss": 0.426, + "step": 3960 + }, + { + "epoch": 1.3482845117129374, + "grad_norm": 5.703563707259275, + "learning_rate": 4.21947269756706e-06, + "loss": 0.4249, + "step": 3965 + }, + { + "epoch": 1.3499851196802857, + "grad_norm": 4.120422582463924, + "learning_rate": 4.217502873163626e-06, + "loss": 0.4319, + "step": 3970 + }, + { + "epoch": 1.351685727647634, + "grad_norm": 7.764067180947526, + "learning_rate": 4.215531027323952e-06, + "loss": 0.4291, + "step": 3975 + }, + { + "epoch": 1.3533863356149824, + "grad_norm": 3.2581402948638423, + "learning_rate": 4.2135571623688244e-06, + "loss": 0.4511, + "step": 3980 + }, + { + "epoch": 1.3550869435823307, + "grad_norm": 4.221269122403145, + "learning_rate": 4.211581280621411e-06, + "loss": 0.4295, + "step": 3985 + }, + { + "epoch": 1.356787551549679, + "grad_norm": 5.052878261691366, + "learning_rate": 4.209603384407248e-06, + "loss": 0.4488, + "step": 3990 + }, + { + "epoch": 1.3584881595170273, + "grad_norm": 9.378569205330985, + "learning_rate": 4.207623476054246e-06, + "loss": 0.4381, + "step": 3995 + }, + { + "epoch": 1.3601887674843756, + "grad_norm": 6.522064207623179, + "learning_rate": 4.205641557892682e-06, + "loss": 0.4298, + "step": 4000 + }, + { + "epoch": 1.361889375451724, + "grad_norm": 5.294462301922804, + "learning_rate": 4.203657632255199e-06, + "loss": 0.421, + "step": 4005 + }, + { + "epoch": 1.3635899834190723, + "grad_norm": 4.134239000036112, + "learning_rate": 4.201671701476803e-06, + "loss": 0.4444, + "step": 4010 + }, + { + "epoch": 1.3652905913864206, + "grad_norm": 6.035380347976283, + "learning_rate": 4.19968376789486e-06, + "loss": 0.4213, + "step": 4015 + }, + { + "epoch": 1.366991199353769, + "grad_norm": 4.7922651854375315, + "learning_rate": 4.1976938338490925e-06, + "loss": 0.4403, + "step": 4020 + }, + { + "epoch": 1.3686918073211172, + "grad_norm": 6.631140177235056, + "learning_rate": 4.195701901681579e-06, + "loss": 0.4352, + "step": 4025 + }, + { + "epoch": 1.3703924152884657, + "grad_norm": 2.7344288539898334, + "learning_rate": 4.193707973736747e-06, + "loss": 0.4093, + "step": 4030 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 4.278791460437773, + "learning_rate": 4.1917120523613766e-06, + "loss": 0.4422, + "step": 4035 + }, + { + "epoch": 1.3737936312231622, + "grad_norm": 5.214406477643654, + "learning_rate": 4.18971413990459e-06, + "loss": 0.4579, + "step": 4040 + }, + { + "epoch": 1.3754942391905107, + "grad_norm": 6.063078786132071, + "learning_rate": 4.187714238717857e-06, + "loss": 0.4394, + "step": 4045 + }, + { + "epoch": 1.377194847157859, + "grad_norm": 63.69948112970655, + "learning_rate": 4.185712351154985e-06, + "loss": 0.439, + "step": 4050 + }, + { + "epoch": 1.3788954551252073, + "grad_norm": 3.2261921517248573, + "learning_rate": 4.18370847957212e-06, + "loss": 0.4198, + "step": 4055 + }, + { + "epoch": 1.3805960630925556, + "grad_norm": 3.634713290168712, + "learning_rate": 4.181702626327745e-06, + "loss": 0.461, + "step": 4060 + }, + { + "epoch": 1.3822966710599038, + "grad_norm": 3.277595544835171, + "learning_rate": 4.179694793782673e-06, + "loss": 0.4596, + "step": 4065 + }, + { + "epoch": 1.3839972790272523, + "grad_norm": 5.408346655373303, + "learning_rate": 4.177684984300046e-06, + "loss": 0.418, + "step": 4070 + }, + { + "epoch": 1.3856978869946006, + "grad_norm": 4.2892952787221015, + "learning_rate": 4.1756732002453345e-06, + "loss": 0.4087, + "step": 4075 + }, + { + "epoch": 1.387398494961949, + "grad_norm": 3.3627724696710706, + "learning_rate": 4.173659443986334e-06, + "loss": 0.4316, + "step": 4080 + }, + { + "epoch": 1.3890991029292972, + "grad_norm": 3.651825188574973, + "learning_rate": 4.17164371789316e-06, + "loss": 0.4455, + "step": 4085 + }, + { + "epoch": 1.3907997108966454, + "grad_norm": 3.366971683134897, + "learning_rate": 4.169626024338245e-06, + "loss": 0.4476, + "step": 4090 + }, + { + "epoch": 1.392500318863994, + "grad_norm": 4.027885573886271, + "learning_rate": 4.167606365696337e-06, + "loss": 0.4213, + "step": 4095 + }, + { + "epoch": 1.3942009268313422, + "grad_norm": 84.54064644279089, + "learning_rate": 4.165584744344502e-06, + "loss": 0.4188, + "step": 4100 + }, + { + "epoch": 1.3959015347986905, + "grad_norm": 3.550654174206856, + "learning_rate": 4.163561162662109e-06, + "loss": 0.4367, + "step": 4105 + }, + { + "epoch": 1.3976021427660388, + "grad_norm": 6.429938907038778, + "learning_rate": 4.161535623030839e-06, + "loss": 0.4347, + "step": 4110 + }, + { + "epoch": 1.399302750733387, + "grad_norm": 3.3469080043284527, + "learning_rate": 4.159508127834676e-06, + "loss": 0.429, + "step": 4115 + }, + { + "epoch": 1.4010033587007356, + "grad_norm": 3.932907324437497, + "learning_rate": 4.157478679459904e-06, + "loss": 0.45, + "step": 4120 + }, + { + "epoch": 1.4027039666680838, + "grad_norm": 3.0465101479498093, + "learning_rate": 4.155447280295109e-06, + "loss": 0.4143, + "step": 4125 + }, + { + "epoch": 1.4044045746354321, + "grad_norm": 6.2332284033665015, + "learning_rate": 4.153413932731172e-06, + "loss": 0.4155, + "step": 4130 + }, + { + "epoch": 1.4061051826027806, + "grad_norm": 4.594711738829712, + "learning_rate": 4.151378639161263e-06, + "loss": 0.4174, + "step": 4135 + }, + { + "epoch": 1.407805790570129, + "grad_norm": 8.369037345643367, + "learning_rate": 4.14934140198085e-06, + "loss": 0.4428, + "step": 4140 + }, + { + "epoch": 1.4095063985374772, + "grad_norm": 4.450892512279648, + "learning_rate": 4.147302223587683e-06, + "loss": 0.4323, + "step": 4145 + }, + { + "epoch": 1.4112070065048254, + "grad_norm": 3.5572501388195965, + "learning_rate": 4.145261106381797e-06, + "loss": 0.4364, + "step": 4150 + }, + { + "epoch": 1.4129076144721737, + "grad_norm": 3.5926379046106303, + "learning_rate": 4.1432180527655105e-06, + "loss": 0.4504, + "step": 4155 + }, + { + "epoch": 1.4146082224395222, + "grad_norm": 3.7409778582649875, + "learning_rate": 4.1411730651434224e-06, + "loss": 0.4366, + "step": 4160 + }, + { + "epoch": 1.4163088304068705, + "grad_norm": 3.3685418588380545, + "learning_rate": 4.1391261459224055e-06, + "loss": 0.4467, + "step": 4165 + }, + { + "epoch": 1.4180094383742188, + "grad_norm": 6.697596996121014, + "learning_rate": 4.137077297511606e-06, + "loss": 0.434, + "step": 4170 + }, + { + "epoch": 1.419710046341567, + "grad_norm": 4.224825510306204, + "learning_rate": 4.135026522322441e-06, + "loss": 0.4157, + "step": 4175 + }, + { + "epoch": 1.4214106543089153, + "grad_norm": 4.358559841751216, + "learning_rate": 4.132973822768597e-06, + "loss": 0.4242, + "step": 4180 + }, + { + "epoch": 1.4231112622762638, + "grad_norm": 4.658618977150568, + "learning_rate": 4.130919201266023e-06, + "loss": 0.4228, + "step": 4185 + }, + { + "epoch": 1.424811870243612, + "grad_norm": 4.397079121273564, + "learning_rate": 4.1288626602329316e-06, + "loss": 0.4458, + "step": 4190 + }, + { + "epoch": 1.4265124782109604, + "grad_norm": 8.910498551520558, + "learning_rate": 4.126804202089795e-06, + "loss": 0.4336, + "step": 4195 + }, + { + "epoch": 1.4282130861783087, + "grad_norm": 5.317988762383234, + "learning_rate": 4.12474382925934e-06, + "loss": 0.4274, + "step": 4200 + }, + { + "epoch": 1.429913694145657, + "grad_norm": 6.420028086835303, + "learning_rate": 4.122681544166548e-06, + "loss": 0.4249, + "step": 4205 + }, + { + "epoch": 1.4316143021130054, + "grad_norm": 3.3486351335128677, + "learning_rate": 4.120617349238651e-06, + "loss": 0.4311, + "step": 4210 + }, + { + "epoch": 1.4333149100803537, + "grad_norm": 3.765520622561078, + "learning_rate": 4.118551246905128e-06, + "loss": 0.404, + "step": 4215 + }, + { + "epoch": 1.435015518047702, + "grad_norm": 4.361629163635694, + "learning_rate": 4.116483239597706e-06, + "loss": 0.4358, + "step": 4220 + }, + { + "epoch": 1.4367161260150505, + "grad_norm": 8.161954348572857, + "learning_rate": 4.1144133297503495e-06, + "loss": 0.4396, + "step": 4225 + }, + { + "epoch": 1.4384167339823988, + "grad_norm": 5.343639505025115, + "learning_rate": 4.1123415197992645e-06, + "loss": 0.4488, + "step": 4230 + }, + { + "epoch": 1.440117341949747, + "grad_norm": 3.896952357426291, + "learning_rate": 4.1102678121828956e-06, + "loss": 0.4164, + "step": 4235 + }, + { + "epoch": 1.4418179499170953, + "grad_norm": 3.778144495904373, + "learning_rate": 4.108192209341916e-06, + "loss": 0.429, + "step": 4240 + }, + { + "epoch": 1.4435185578844436, + "grad_norm": 4.379626057470475, + "learning_rate": 4.1061147137192325e-06, + "loss": 0.4273, + "step": 4245 + }, + { + "epoch": 1.445219165851792, + "grad_norm": 3.3970834318244982, + "learning_rate": 4.104035327759981e-06, + "loss": 0.4324, + "step": 4250 + }, + { + "epoch": 1.4469197738191404, + "grad_norm": 6.877015746846206, + "learning_rate": 4.101954053911519e-06, + "loss": 0.427, + "step": 4255 + }, + { + "epoch": 1.4486203817864887, + "grad_norm": 8.771197237906213, + "learning_rate": 4.099870894623429e-06, + "loss": 0.427, + "step": 4260 + }, + { + "epoch": 1.450320989753837, + "grad_norm": 4.106539562178847, + "learning_rate": 4.097785852347509e-06, + "loss": 0.4327, + "step": 4265 + }, + { + "epoch": 1.4520215977211852, + "grad_norm": 3.0368575048625503, + "learning_rate": 4.095698929537776e-06, + "loss": 0.4247, + "step": 4270 + }, + { + "epoch": 1.4537222056885337, + "grad_norm": 4.406684997834663, + "learning_rate": 4.093610128650461e-06, + "loss": 0.4328, + "step": 4275 + }, + { + "epoch": 1.455422813655882, + "grad_norm": 10.807724313086286, + "learning_rate": 4.091519452144002e-06, + "loss": 0.4093, + "step": 4280 + }, + { + "epoch": 1.4571234216232303, + "grad_norm": 4.2489620858096995, + "learning_rate": 4.089426902479048e-06, + "loss": 0.4166, + "step": 4285 + }, + { + "epoch": 1.4588240295905786, + "grad_norm": 15.82562295022666, + "learning_rate": 4.0873324821184505e-06, + "loss": 0.4137, + "step": 4290 + }, + { + "epoch": 1.4605246375579268, + "grad_norm": 20.912345821951686, + "learning_rate": 4.085236193527264e-06, + "loss": 0.4524, + "step": 4295 + }, + { + "epoch": 1.4622252455252753, + "grad_norm": 10.918557580898346, + "learning_rate": 4.08313803917274e-06, + "loss": 0.4268, + "step": 4300 + }, + { + "epoch": 1.4639258534926236, + "grad_norm": 3.4822802477088337, + "learning_rate": 4.08103802152433e-06, + "loss": 0.4283, + "step": 4305 + }, + { + "epoch": 1.4656264614599719, + "grad_norm": 4.537596013386029, + "learning_rate": 4.078936143053673e-06, + "loss": 0.4502, + "step": 4310 + }, + { + "epoch": 1.4673270694273204, + "grad_norm": 4.038796537691099, + "learning_rate": 4.076832406234601e-06, + "loss": 0.4429, + "step": 4315 + }, + { + "epoch": 1.4690276773946687, + "grad_norm": 5.579890053055556, + "learning_rate": 4.074726813543134e-06, + "loss": 0.424, + "step": 4320 + }, + { + "epoch": 1.470728285362017, + "grad_norm": 6.688734469203853, + "learning_rate": 4.072619367457475e-06, + "loss": 0.4596, + "step": 4325 + }, + { + "epoch": 1.4724288933293652, + "grad_norm": 3.5175009361444314, + "learning_rate": 4.070510070458009e-06, + "loss": 0.4227, + "step": 4330 + }, + { + "epoch": 1.4741295012967135, + "grad_norm": 10.407473182628523, + "learning_rate": 4.068398925027299e-06, + "loss": 0.4277, + "step": 4335 + }, + { + "epoch": 1.475830109264062, + "grad_norm": 4.001806688070732, + "learning_rate": 4.0662859336500834e-06, + "loss": 0.4382, + "step": 4340 + }, + { + "epoch": 1.4775307172314103, + "grad_norm": 9.423854658257099, + "learning_rate": 4.064171098813274e-06, + "loss": 0.4325, + "step": 4345 + }, + { + "epoch": 1.4792313251987586, + "grad_norm": 2.8345757589574205, + "learning_rate": 4.062054423005952e-06, + "loss": 0.4203, + "step": 4350 + }, + { + "epoch": 1.4809319331661068, + "grad_norm": 6.113162495170087, + "learning_rate": 4.059935908719366e-06, + "loss": 0.4194, + "step": 4355 + }, + { + "epoch": 1.482632541133455, + "grad_norm": 3.5573072498137313, + "learning_rate": 4.0578155584469245e-06, + "loss": 0.4406, + "step": 4360 + }, + { + "epoch": 1.4843331491008036, + "grad_norm": 6.944016100572673, + "learning_rate": 4.055693374684203e-06, + "loss": 0.4426, + "step": 4365 + }, + { + "epoch": 1.4860337570681519, + "grad_norm": 3.0991563801030995, + "learning_rate": 4.05356935992893e-06, + "loss": 0.4286, + "step": 4370 + }, + { + "epoch": 1.4877343650355002, + "grad_norm": 3.876016849273567, + "learning_rate": 4.051443516680991e-06, + "loss": 0.4059, + "step": 4375 + }, + { + "epoch": 1.4894349730028484, + "grad_norm": 3.1618825548418514, + "learning_rate": 4.049315847442426e-06, + "loss": 0.4508, + "step": 4380 + }, + { + "epoch": 1.4911355809701967, + "grad_norm": 3.26722333104821, + "learning_rate": 4.047186354717419e-06, + "loss": 0.4377, + "step": 4385 + }, + { + "epoch": 1.4928361889375452, + "grad_norm": 4.949642233534566, + "learning_rate": 4.045055041012304e-06, + "loss": 0.4172, + "step": 4390 + }, + { + "epoch": 1.4945367969048935, + "grad_norm": 3.9737091600809586, + "learning_rate": 4.042921908835557e-06, + "loss": 0.425, + "step": 4395 + }, + { + "epoch": 1.4962374048722418, + "grad_norm": 5.237403020237026, + "learning_rate": 4.040786960697793e-06, + "loss": 0.4168, + "step": 4400 + }, + { + "epoch": 1.4979380128395903, + "grad_norm": 3.30834199256293, + "learning_rate": 4.038650199111766e-06, + "loss": 0.4019, + "step": 4405 + }, + { + "epoch": 1.4996386208069385, + "grad_norm": 9.193316798315635, + "learning_rate": 4.036511626592366e-06, + "loss": 0.4412, + "step": 4410 + }, + { + "epoch": 1.5013392287742868, + "grad_norm": 3.367832782080482, + "learning_rate": 4.034371245656611e-06, + "loss": 0.4338, + "step": 4415 + }, + { + "epoch": 1.503039836741635, + "grad_norm": 6.827626401756652, + "learning_rate": 4.0322290588236475e-06, + "loss": 0.4103, + "step": 4420 + }, + { + "epoch": 1.5047404447089834, + "grad_norm": 9.80059877372828, + "learning_rate": 4.03008506861475e-06, + "loss": 0.4311, + "step": 4425 + }, + { + "epoch": 1.5064410526763319, + "grad_norm": 4.0833672787431095, + "learning_rate": 4.027939277553314e-06, + "loss": 0.422, + "step": 4430 + }, + { + "epoch": 1.5081416606436802, + "grad_norm": 4.352980395173143, + "learning_rate": 4.025791688164856e-06, + "loss": 0.4165, + "step": 4435 + }, + { + "epoch": 1.5098422686110284, + "grad_norm": 3.6443666952119393, + "learning_rate": 4.023642302977007e-06, + "loss": 0.418, + "step": 4440 + }, + { + "epoch": 1.5115428765783767, + "grad_norm": 2.5535581373220566, + "learning_rate": 4.021491124519512e-06, + "loss": 0.4128, + "step": 4445 + }, + { + "epoch": 1.513243484545725, + "grad_norm": 2.5635629960000257, + "learning_rate": 4.0193381553242275e-06, + "loss": 0.4265, + "step": 4450 + }, + { + "epoch": 1.5149440925130735, + "grad_norm": 9.71155883976001, + "learning_rate": 4.017183397925116e-06, + "loss": 0.4059, + "step": 4455 + }, + { + "epoch": 1.5166447004804218, + "grad_norm": 3.963949214840473, + "learning_rate": 4.015026854858248e-06, + "loss": 0.4315, + "step": 4460 + }, + { + "epoch": 1.51834530844777, + "grad_norm": 10.098506319170221, + "learning_rate": 4.012868528661793e-06, + "loss": 0.4197, + "step": 4465 + }, + { + "epoch": 1.5200459164151185, + "grad_norm": 7.311071257965433, + "learning_rate": 4.01070842187602e-06, + "loss": 0.3961, + "step": 4470 + }, + { + "epoch": 1.5217465243824666, + "grad_norm": 2.5919745636282623, + "learning_rate": 4.0085465370432906e-06, + "loss": 0.4339, + "step": 4475 + }, + { + "epoch": 1.523447132349815, + "grad_norm": 9.414821359866528, + "learning_rate": 4.006382876708066e-06, + "loss": 0.4381, + "step": 4480 + }, + { + "epoch": 1.5251477403171634, + "grad_norm": 3.2069921113732196, + "learning_rate": 4.004217443416889e-06, + "loss": 0.4221, + "step": 4485 + }, + { + "epoch": 1.5268483482845117, + "grad_norm": 4.108172954869957, + "learning_rate": 4.0020502397183955e-06, + "loss": 0.4404, + "step": 4490 + }, + { + "epoch": 1.5285489562518602, + "grad_norm": 4.7412987043119506, + "learning_rate": 3.9998812681633036e-06, + "loss": 0.4076, + "step": 4495 + }, + { + "epoch": 1.5302495642192082, + "grad_norm": 5.177073536747887, + "learning_rate": 3.9977105313044084e-06, + "loss": 0.4039, + "step": 4500 + }, + { + "epoch": 1.5319501721865567, + "grad_norm": 3.6093301676901537, + "learning_rate": 3.995538031696588e-06, + "loss": 0.4138, + "step": 4505 + }, + { + "epoch": 1.533650780153905, + "grad_norm": 5.26144190406865, + "learning_rate": 3.99336377189679e-06, + "loss": 0.4079, + "step": 4510 + }, + { + "epoch": 1.5353513881212533, + "grad_norm": 4.173292833080453, + "learning_rate": 3.991187754464039e-06, + "loss": 0.437, + "step": 4515 + }, + { + "epoch": 1.5370519960886018, + "grad_norm": 4.696152943235458, + "learning_rate": 3.989009981959424e-06, + "loss": 0.4103, + "step": 4520 + }, + { + "epoch": 1.53875260405595, + "grad_norm": 4.3396528642715335, + "learning_rate": 3.986830456946102e-06, + "loss": 0.3966, + "step": 4525 + }, + { + "epoch": 1.5404532120232983, + "grad_norm": 9.149805076911427, + "learning_rate": 3.984649181989292e-06, + "loss": 0.4195, + "step": 4530 + }, + { + "epoch": 1.5421538199906466, + "grad_norm": 6.507804562542518, + "learning_rate": 3.982466159656271e-06, + "loss": 0.4295, + "step": 4535 + }, + { + "epoch": 1.5438544279579949, + "grad_norm": 3.6654863638989053, + "learning_rate": 3.980281392516376e-06, + "loss": 0.4303, + "step": 4540 + }, + { + "epoch": 1.5455550359253434, + "grad_norm": 4.5023392766309325, + "learning_rate": 3.978094883140996e-06, + "loss": 0.42, + "step": 4545 + }, + { + "epoch": 1.5472556438926917, + "grad_norm": 5.735810888404813, + "learning_rate": 3.975906634103569e-06, + "loss": 0.4228, + "step": 4550 + }, + { + "epoch": 1.54895625186004, + "grad_norm": 4.02035436801197, + "learning_rate": 3.973716647979581e-06, + "loss": 0.4124, + "step": 4555 + }, + { + "epoch": 1.5506568598273884, + "grad_norm": 4.557603410060842, + "learning_rate": 3.971524927346565e-06, + "loss": 0.4281, + "step": 4560 + }, + { + "epoch": 1.5523574677947365, + "grad_norm": 4.962119472804869, + "learning_rate": 3.969331474784092e-06, + "loss": 0.4406, + "step": 4565 + }, + { + "epoch": 1.554058075762085, + "grad_norm": 3.2416613132367713, + "learning_rate": 3.967136292873776e-06, + "loss": 0.4329, + "step": 4570 + }, + { + "epoch": 1.5557586837294333, + "grad_norm": 7.2371279744266115, + "learning_rate": 3.96493938419926e-06, + "loss": 0.4146, + "step": 4575 + }, + { + "epoch": 1.5574592916967815, + "grad_norm": 9.496380193325114, + "learning_rate": 3.962740751346224e-06, + "loss": 0.4391, + "step": 4580 + }, + { + "epoch": 1.55915989966413, + "grad_norm": 3.9368690731569314, + "learning_rate": 3.960540396902378e-06, + "loss": 0.4201, + "step": 4585 + }, + { + "epoch": 1.560860507631478, + "grad_norm": 11.53126552855203, + "learning_rate": 3.958338323457455e-06, + "loss": 0.4242, + "step": 4590 + }, + { + "epoch": 1.5625611155988266, + "grad_norm": 3.5540690187464805, + "learning_rate": 3.956134533603211e-06, + "loss": 0.4191, + "step": 4595 + }, + { + "epoch": 1.5642617235661749, + "grad_norm": 4.546387352827355, + "learning_rate": 3.953929029933427e-06, + "loss": 0.4361, + "step": 4600 + }, + { + "epoch": 1.5659623315335232, + "grad_norm": 4.516207050932684, + "learning_rate": 3.951721815043895e-06, + "loss": 0.4319, + "step": 4605 + }, + { + "epoch": 1.5676629395008717, + "grad_norm": 4.965601982415111, + "learning_rate": 3.949512891532424e-06, + "loss": 0.3922, + "step": 4610 + }, + { + "epoch": 1.56936354746822, + "grad_norm": 4.085020023753369, + "learning_rate": 3.9473022619988364e-06, + "loss": 0.4238, + "step": 4615 + }, + { + "epoch": 1.5710641554355682, + "grad_norm": 6.507811927998727, + "learning_rate": 3.945089929044957e-06, + "loss": 0.4134, + "step": 4620 + }, + { + "epoch": 1.5727647634029165, + "grad_norm": 3.126306855943486, + "learning_rate": 3.94287589527462e-06, + "loss": 0.4193, + "step": 4625 + }, + { + "epoch": 1.5744653713702648, + "grad_norm": 5.4206195687295295, + "learning_rate": 3.940660163293659e-06, + "loss": 0.4537, + "step": 4630 + }, + { + "epoch": 1.5761659793376133, + "grad_norm": 4.308804498979996, + "learning_rate": 3.9384427357099084e-06, + "loss": 0.4532, + "step": 4635 + }, + { + "epoch": 1.5778665873049615, + "grad_norm": 3.5101886165442937, + "learning_rate": 3.936223615133195e-06, + "loss": 0.3952, + "step": 4640 + }, + { + "epoch": 1.5795671952723098, + "grad_norm": 5.5337619345336915, + "learning_rate": 3.934002804175343e-06, + "loss": 0.4122, + "step": 4645 + }, + { + "epoch": 1.5812678032396583, + "grad_norm": 3.6597291462675376, + "learning_rate": 3.931780305450161e-06, + "loss": 0.4046, + "step": 4650 + }, + { + "epoch": 1.5829684112070064, + "grad_norm": 22.15305892455393, + "learning_rate": 3.929556121573447e-06, + "loss": 0.4326, + "step": 4655 + }, + { + "epoch": 1.5846690191743549, + "grad_norm": 3.649143816745209, + "learning_rate": 3.9273302551629825e-06, + "loss": 0.4141, + "step": 4660 + }, + { + "epoch": 1.5863696271417032, + "grad_norm": 6.823266912741127, + "learning_rate": 3.925102708838527e-06, + "loss": 0.4178, + "step": 4665 + }, + { + "epoch": 1.5880702351090514, + "grad_norm": 5.531710013288801, + "learning_rate": 3.92287348522182e-06, + "loss": 0.4406, + "step": 4670 + }, + { + "epoch": 1.5897708430764, + "grad_norm": 8.859640722590902, + "learning_rate": 3.920642586936573e-06, + "loss": 0.4097, + "step": 4675 + }, + { + "epoch": 1.591471451043748, + "grad_norm": 3.0945720240734005, + "learning_rate": 3.918410016608469e-06, + "loss": 0.4484, + "step": 4680 + }, + { + "epoch": 1.5931720590110965, + "grad_norm": 5.242209952001623, + "learning_rate": 3.916175776865161e-06, + "loss": 0.4035, + "step": 4685 + }, + { + "epoch": 1.5948726669784448, + "grad_norm": 4.970760265373998, + "learning_rate": 3.9139398703362635e-06, + "loss": 0.4475, + "step": 4690 + }, + { + "epoch": 1.596573274945793, + "grad_norm": 4.629251668056431, + "learning_rate": 3.911702299653355e-06, + "loss": 0.4346, + "step": 4695 + }, + { + "epoch": 1.5982738829131415, + "grad_norm": 6.736287650537567, + "learning_rate": 3.909463067449971e-06, + "loss": 0.4276, + "step": 4700 + }, + { + "epoch": 1.5999744908804898, + "grad_norm": 5.197489467524276, + "learning_rate": 3.907222176361605e-06, + "loss": 0.4235, + "step": 4705 + }, + { + "epoch": 1.601675098847838, + "grad_norm": 6.135582408058318, + "learning_rate": 3.9049796290257e-06, + "loss": 0.4121, + "step": 4710 + }, + { + "epoch": 1.6033757068151864, + "grad_norm": 6.332586568891612, + "learning_rate": 3.902735428081651e-06, + "loss": 0.3969, + "step": 4715 + }, + { + "epoch": 1.6050763147825347, + "grad_norm": 4.634540377565531, + "learning_rate": 3.900489576170798e-06, + "loss": 0.4181, + "step": 4720 + }, + { + "epoch": 1.6067769227498832, + "grad_norm": 2.8978019838432, + "learning_rate": 3.898242075936423e-06, + "loss": 0.4428, + "step": 4725 + }, + { + "epoch": 1.6084775307172314, + "grad_norm": 14.566279377899951, + "learning_rate": 3.895992930023751e-06, + "loss": 0.4039, + "step": 4730 + }, + { + "epoch": 1.6101781386845797, + "grad_norm": 3.483953536288087, + "learning_rate": 3.89374214107994e-06, + "loss": 0.4383, + "step": 4735 + }, + { + "epoch": 1.6118787466519282, + "grad_norm": 4.178882102249056, + "learning_rate": 3.891489711754085e-06, + "loss": 0.4253, + "step": 4740 + }, + { + "epoch": 1.6135793546192763, + "grad_norm": 3.37242894448145, + "learning_rate": 3.8892356446972115e-06, + "loss": 0.43, + "step": 4745 + }, + { + "epoch": 1.6152799625866248, + "grad_norm": 26.59330743195478, + "learning_rate": 3.8869799425622695e-06, + "loss": 0.3683, + "step": 4750 + }, + { + "epoch": 1.616980570553973, + "grad_norm": 3.2112027869457656, + "learning_rate": 3.884722608004137e-06, + "loss": 0.4278, + "step": 4755 + }, + { + "epoch": 1.6186811785213213, + "grad_norm": 3.344270702003903, + "learning_rate": 3.882463643679612e-06, + "loss": 0.4212, + "step": 4760 + }, + { + "epoch": 1.6203817864886698, + "grad_norm": 2.75101259170878, + "learning_rate": 3.880203052247409e-06, + "loss": 0.4329, + "step": 4765 + }, + { + "epoch": 1.6220823944560179, + "grad_norm": 3.9130782115066487, + "learning_rate": 3.8779408363681596e-06, + "loss": 0.4172, + "step": 4770 + }, + { + "epoch": 1.6237830024233664, + "grad_norm": 4.017189388326885, + "learning_rate": 3.875676998704408e-06, + "loss": 0.4262, + "step": 4775 + }, + { + "epoch": 1.6254836103907147, + "grad_norm": 3.8048248710539827, + "learning_rate": 3.873411541920604e-06, + "loss": 0.4205, + "step": 4780 + }, + { + "epoch": 1.627184218358063, + "grad_norm": 3.3326058573835655, + "learning_rate": 3.871144468683106e-06, + "loss": 0.4345, + "step": 4785 + }, + { + "epoch": 1.6288848263254114, + "grad_norm": 8.492427714369212, + "learning_rate": 3.8688757816601746e-06, + "loss": 0.4436, + "step": 4790 + }, + { + "epoch": 1.6305854342927597, + "grad_norm": 3.1452242135910007, + "learning_rate": 3.866605483521968e-06, + "loss": 0.4281, + "step": 4795 + }, + { + "epoch": 1.632286042260108, + "grad_norm": 5.627570929095714, + "learning_rate": 3.864333576940542e-06, + "loss": 0.4254, + "step": 4800 + }, + { + "epoch": 1.6339866502274563, + "grad_norm": 2.913161920879035, + "learning_rate": 3.862060064589845e-06, + "loss": 0.4113, + "step": 4805 + }, + { + "epoch": 1.6356872581948045, + "grad_norm": 3.7039601844045973, + "learning_rate": 3.859784949145715e-06, + "loss": 0.4204, + "step": 4810 + }, + { + "epoch": 1.637387866162153, + "grad_norm": 3.0916900895398345, + "learning_rate": 3.857508233285879e-06, + "loss": 0.4287, + "step": 4815 + }, + { + "epoch": 1.6390884741295013, + "grad_norm": 3.6692889623634755, + "learning_rate": 3.855229919689944e-06, + "loss": 0.4007, + "step": 4820 + }, + { + "epoch": 1.6407890820968496, + "grad_norm": 2.8591255231281116, + "learning_rate": 3.8529500110394e-06, + "loss": 0.4335, + "step": 4825 + }, + { + "epoch": 1.642489690064198, + "grad_norm": 4.056310941557023, + "learning_rate": 3.850668510017613e-06, + "loss": 0.4406, + "step": 4830 + }, + { + "epoch": 1.6441902980315461, + "grad_norm": 5.395369896084443, + "learning_rate": 3.848385419309826e-06, + "loss": 0.3904, + "step": 4835 + }, + { + "epoch": 1.6458909059988946, + "grad_norm": 4.861776683262736, + "learning_rate": 3.846100741603148e-06, + "loss": 0.4247, + "step": 4840 + }, + { + "epoch": 1.647591513966243, + "grad_norm": 3.41023038334364, + "learning_rate": 3.84381447958656e-06, + "loss": 0.4076, + "step": 4845 + }, + { + "epoch": 1.6492921219335912, + "grad_norm": 3.6805091047550236, + "learning_rate": 3.8415266359509086e-06, + "loss": 0.4053, + "step": 4850 + }, + { + "epoch": 1.6509927299009397, + "grad_norm": 20.0982243813851, + "learning_rate": 3.8392372133888955e-06, + "loss": 0.4016, + "step": 4855 + }, + { + "epoch": 1.6526933378682878, + "grad_norm": 3.7021790123051233, + "learning_rate": 3.836946214595087e-06, + "loss": 0.4285, + "step": 4860 + }, + { + "epoch": 1.6543939458356363, + "grad_norm": 44.60756019965565, + "learning_rate": 3.834653642265902e-06, + "loss": 0.4207, + "step": 4865 + }, + { + "epoch": 1.6560945538029845, + "grad_norm": 4.035211050978466, + "learning_rate": 3.832359499099613e-06, + "loss": 0.453, + "step": 4870 + }, + { + "epoch": 1.6577951617703328, + "grad_norm": 6.698228887542318, + "learning_rate": 3.830063787796339e-06, + "loss": 0.4306, + "step": 4875 + }, + { + "epoch": 1.6594957697376813, + "grad_norm": 4.749773815351558, + "learning_rate": 3.827766511058046e-06, + "loss": 0.4571, + "step": 4880 + }, + { + "epoch": 1.6611963777050296, + "grad_norm": 22.04169770775326, + "learning_rate": 3.8254676715885416e-06, + "loss": 0.4191, + "step": 4885 + }, + { + "epoch": 1.6628969856723779, + "grad_norm": 4.802174235750299, + "learning_rate": 3.823167272093475e-06, + "loss": 0.419, + "step": 4890 + }, + { + "epoch": 1.6645975936397261, + "grad_norm": 12.428634508926901, + "learning_rate": 3.820865315280329e-06, + "loss": 0.4048, + "step": 4895 + }, + { + "epoch": 1.6662982016070744, + "grad_norm": 4.007313720748017, + "learning_rate": 3.81856180385842e-06, + "loss": 0.4132, + "step": 4900 + }, + { + "epoch": 1.667998809574423, + "grad_norm": 4.2309464190047805, + "learning_rate": 3.816256740538894e-06, + "loss": 0.3967, + "step": 4905 + }, + { + "epoch": 1.6696994175417712, + "grad_norm": 11.11545501855301, + "learning_rate": 3.8139501280347243e-06, + "loss": 0.3825, + "step": 4910 + }, + { + "epoch": 1.6714000255091195, + "grad_norm": 3.4032753612741136, + "learning_rate": 3.8116419690607066e-06, + "loss": 0.422, + "step": 4915 + }, + { + "epoch": 1.673100633476468, + "grad_norm": 8.32912298085406, + "learning_rate": 3.8093322663334574e-06, + "loss": 0.4079, + "step": 4920 + }, + { + "epoch": 1.674801241443816, + "grad_norm": 8.136105962229143, + "learning_rate": 3.8070210225714092e-06, + "loss": 0.4225, + "step": 4925 + }, + { + "epoch": 1.6765018494111645, + "grad_norm": 6.552850991103475, + "learning_rate": 3.80470824049481e-06, + "loss": 0.4251, + "step": 4930 + }, + { + "epoch": 1.6782024573785128, + "grad_norm": 3.70278560798378, + "learning_rate": 3.802393922825717e-06, + "loss": 0.4231, + "step": 4935 + }, + { + "epoch": 1.679903065345861, + "grad_norm": 13.268090115376083, + "learning_rate": 3.8000780722879937e-06, + "loss": 0.4054, + "step": 4940 + }, + { + "epoch": 1.6816036733132096, + "grad_norm": 4.909682908882996, + "learning_rate": 3.7977606916073113e-06, + "loss": 0.4298, + "step": 4945 + }, + { + "epoch": 1.6833042812805576, + "grad_norm": 6.5152501229026605, + "learning_rate": 3.795441783511138e-06, + "loss": 0.4279, + "step": 4950 + }, + { + "epoch": 1.6850048892479061, + "grad_norm": 8.879458780565201, + "learning_rate": 3.7931213507287417e-06, + "loss": 0.4232, + "step": 4955 + }, + { + "epoch": 1.6867054972152544, + "grad_norm": 4.539443656122637, + "learning_rate": 3.790799395991185e-06, + "loss": 0.3957, + "step": 4960 + }, + { + "epoch": 1.6884061051826027, + "grad_norm": 3.314679697063856, + "learning_rate": 3.78847592203132e-06, + "loss": 0.4335, + "step": 4965 + }, + { + "epoch": 1.6901067131499512, + "grad_norm": 3.5744798605623167, + "learning_rate": 3.7861509315837898e-06, + "loss": 0.4205, + "step": 4970 + }, + { + "epoch": 1.6918073211172995, + "grad_norm": 4.249754872291892, + "learning_rate": 3.7838244273850187e-06, + "loss": 0.3945, + "step": 4975 + }, + { + "epoch": 1.6935079290846478, + "grad_norm": 3.409071184852231, + "learning_rate": 3.7814964121732164e-06, + "loss": 0.4317, + "step": 4980 + }, + { + "epoch": 1.695208537051996, + "grad_norm": 3.6628545875674976, + "learning_rate": 3.7791668886883675e-06, + "loss": 0.4161, + "step": 4985 + }, + { + "epoch": 1.6969091450193443, + "grad_norm": 7.581007499161183, + "learning_rate": 3.7768358596722356e-06, + "loss": 0.4171, + "step": 4990 + }, + { + "epoch": 1.6986097529866928, + "grad_norm": 3.765634764539352, + "learning_rate": 3.7745033278683506e-06, + "loss": 0.4222, + "step": 4995 + }, + { + "epoch": 1.700310360954041, + "grad_norm": 3.8136428107692986, + "learning_rate": 3.772169296022019e-06, + "loss": 0.4222, + "step": 5000 + }, + { + "epoch": 1.7020109689213894, + "grad_norm": 4.324620527193306, + "learning_rate": 3.7698337668803054e-06, + "loss": 0.4023, + "step": 5005 + }, + { + "epoch": 1.7037115768887379, + "grad_norm": 2.7411830689404, + "learning_rate": 3.767496743192042e-06, + "loss": 0.4215, + "step": 5010 + }, + { + "epoch": 1.705412184856086, + "grad_norm": 3.7118408685683497, + "learning_rate": 3.7651582277078148e-06, + "loss": 0.4369, + "step": 5015 + }, + { + "epoch": 1.7071127928234344, + "grad_norm": 9.36456704680626, + "learning_rate": 3.7628182231799703e-06, + "loss": 0.4033, + "step": 5020 + }, + { + "epoch": 1.7088134007907827, + "grad_norm": 4.523007688855365, + "learning_rate": 3.760476732362606e-06, + "loss": 0.4084, + "step": 5025 + }, + { + "epoch": 1.710514008758131, + "grad_norm": 3.6411870432062265, + "learning_rate": 3.7581337580115683e-06, + "loss": 0.3942, + "step": 5030 + }, + { + "epoch": 1.7122146167254795, + "grad_norm": 3.4921491078651363, + "learning_rate": 3.755789302884449e-06, + "loss": 0.4075, + "step": 5035 + }, + { + "epoch": 1.7139152246928275, + "grad_norm": 2.482904014486906, + "learning_rate": 3.7534433697405842e-06, + "loss": 0.417, + "step": 5040 + }, + { + "epoch": 1.715615832660176, + "grad_norm": 2.297692680568294, + "learning_rate": 3.751095961341049e-06, + "loss": 0.4106, + "step": 5045 + }, + { + "epoch": 1.7173164406275243, + "grad_norm": 3.4326386505188715, + "learning_rate": 3.748747080448654e-06, + "loss": 0.4226, + "step": 5050 + }, + { + "epoch": 1.7190170485948726, + "grad_norm": 4.785651921836002, + "learning_rate": 3.746396729827944e-06, + "loss": 0.4081, + "step": 5055 + }, + { + "epoch": 1.720717656562221, + "grad_norm": 3.0609624749389277, + "learning_rate": 3.744044912245194e-06, + "loss": 0.4008, + "step": 5060 + }, + { + "epoch": 1.7224182645295694, + "grad_norm": 4.091890461031847, + "learning_rate": 3.741691630468404e-06, + "loss": 0.4173, + "step": 5065 + }, + { + "epoch": 1.7241188724969176, + "grad_norm": 4.649672756554727, + "learning_rate": 3.739336887267298e-06, + "loss": 0.412, + "step": 5070 + }, + { + "epoch": 1.725819480464266, + "grad_norm": 5.44341678306046, + "learning_rate": 3.7369806854133204e-06, + "loss": 0.4137, + "step": 5075 + }, + { + "epoch": 1.7275200884316142, + "grad_norm": 3.474137553393895, + "learning_rate": 3.7346230276796325e-06, + "loss": 0.411, + "step": 5080 + }, + { + "epoch": 1.7292206963989627, + "grad_norm": 4.395765896168521, + "learning_rate": 3.7322639168411077e-06, + "loss": 0.4293, + "step": 5085 + }, + { + "epoch": 1.730921304366311, + "grad_norm": 7.0977714130839304, + "learning_rate": 3.729903355674332e-06, + "loss": 0.4142, + "step": 5090 + }, + { + "epoch": 1.7326219123336593, + "grad_norm": 7.852932063860036, + "learning_rate": 3.7275413469575955e-06, + "loss": 0.4282, + "step": 5095 + }, + { + "epoch": 1.7343225203010078, + "grad_norm": 4.817210240263915, + "learning_rate": 3.725177893470895e-06, + "loss": 0.4256, + "step": 5100 + }, + { + "epoch": 1.7360231282683558, + "grad_norm": 2.5794826397829436, + "learning_rate": 3.722812997995925e-06, + "loss": 0.4058, + "step": 5105 + }, + { + "epoch": 1.7377237362357043, + "grad_norm": 3.660698064573587, + "learning_rate": 3.7204466633160796e-06, + "loss": 0.4105, + "step": 5110 + }, + { + "epoch": 1.7394243442030526, + "grad_norm": 7.6636100537984335, + "learning_rate": 3.7180788922164446e-06, + "loss": 0.4082, + "step": 5115 + }, + { + "epoch": 1.7411249521704009, + "grad_norm": 3.8422655885497377, + "learning_rate": 3.7157096874837985e-06, + "loss": 0.4131, + "step": 5120 + }, + { + "epoch": 1.7428255601377494, + "grad_norm": 4.651148660746592, + "learning_rate": 3.7133390519066048e-06, + "loss": 0.4213, + "step": 5125 + }, + { + "epoch": 1.7445261681050974, + "grad_norm": 3.4297472647983045, + "learning_rate": 3.7109669882750145e-06, + "loss": 0.4318, + "step": 5130 + }, + { + "epoch": 1.746226776072446, + "grad_norm": 35.092387165068224, + "learning_rate": 3.7085934993808546e-06, + "loss": 0.4223, + "step": 5135 + }, + { + "epoch": 1.7479273840397942, + "grad_norm": 4.45847131877625, + "learning_rate": 3.706218588017635e-06, + "loss": 0.3913, + "step": 5140 + }, + { + "epoch": 1.7496279920071425, + "grad_norm": 4.7597757325098256, + "learning_rate": 3.7038422569805342e-06, + "loss": 0.4193, + "step": 5145 + }, + { + "epoch": 1.751328599974491, + "grad_norm": 4.462704348907563, + "learning_rate": 3.7014645090664065e-06, + "loss": 0.415, + "step": 5150 + }, + { + "epoch": 1.7530292079418393, + "grad_norm": 12.282193653179133, + "learning_rate": 3.6990853470737704e-06, + "loss": 0.4181, + "step": 5155 + }, + { + "epoch": 1.7547298159091875, + "grad_norm": 11.877335638069301, + "learning_rate": 3.6967047738028106e-06, + "loss": 0.4111, + "step": 5160 + }, + { + "epoch": 1.7564304238765358, + "grad_norm": 6.323596753835098, + "learning_rate": 3.6943227920553727e-06, + "loss": 0.4228, + "step": 5165 + }, + { + "epoch": 1.758131031843884, + "grad_norm": 5.0439197351461225, + "learning_rate": 3.6919394046349583e-06, + "loss": 0.4177, + "step": 5170 + }, + { + "epoch": 1.7598316398112326, + "grad_norm": 7.547777679994188, + "learning_rate": 3.6895546143467254e-06, + "loss": 0.4305, + "step": 5175 + }, + { + "epoch": 1.7615322477785809, + "grad_norm": 3.389867040154592, + "learning_rate": 3.6871684239974825e-06, + "loss": 0.4147, + "step": 5180 + }, + { + "epoch": 1.7632328557459291, + "grad_norm": 3.1099841932552965, + "learning_rate": 3.684780836395686e-06, + "loss": 0.4026, + "step": 5185 + }, + { + "epoch": 1.7649334637132776, + "grad_norm": 2.961474684532061, + "learning_rate": 3.6823918543514365e-06, + "loss": 0.4185, + "step": 5190 + }, + { + "epoch": 1.7666340716806257, + "grad_norm": 5.724440893942992, + "learning_rate": 3.680001480676475e-06, + "loss": 0.412, + "step": 5195 + }, + { + "epoch": 1.7683346796479742, + "grad_norm": 4.205497790469296, + "learning_rate": 3.677609718184183e-06, + "loss": 0.4054, + "step": 5200 + }, + { + "epoch": 1.7700352876153225, + "grad_norm": 2.683317009048377, + "learning_rate": 3.675216569689574e-06, + "loss": 0.4122, + "step": 5205 + }, + { + "epoch": 1.7717358955826708, + "grad_norm": 4.535946082169053, + "learning_rate": 3.672822038009294e-06, + "loss": 0.4146, + "step": 5210 + }, + { + "epoch": 1.7734365035500192, + "grad_norm": 3.380527665829096, + "learning_rate": 3.6704261259616164e-06, + "loss": 0.4213, + "step": 5215 + }, + { + "epoch": 1.7751371115173673, + "grad_norm": 3.870065998174079, + "learning_rate": 3.6680288363664394e-06, + "loss": 0.4074, + "step": 5220 + }, + { + "epoch": 1.7768377194847158, + "grad_norm": 3.1354538564619334, + "learning_rate": 3.6656301720452835e-06, + "loss": 0.4228, + "step": 5225 + }, + { + "epoch": 1.778538327452064, + "grad_norm": 5.400817932617311, + "learning_rate": 3.6632301358212853e-06, + "loss": 0.4246, + "step": 5230 + }, + { + "epoch": 1.7802389354194124, + "grad_norm": 3.8399689380174276, + "learning_rate": 3.6608287305191973e-06, + "loss": 0.4308, + "step": 5235 + }, + { + "epoch": 1.7819395433867609, + "grad_norm": 2.997454835558299, + "learning_rate": 3.6584259589653837e-06, + "loss": 0.4137, + "step": 5240 + }, + { + "epoch": 1.7836401513541091, + "grad_norm": 3.350556475913989, + "learning_rate": 3.656021823987815e-06, + "loss": 0.4005, + "step": 5245 + }, + { + "epoch": 1.7853407593214574, + "grad_norm": 2.8203194142875363, + "learning_rate": 3.6536163284160693e-06, + "loss": 0.4476, + "step": 5250 + }, + { + "epoch": 1.7870413672888057, + "grad_norm": 3.308402155803443, + "learning_rate": 3.6512094750813233e-06, + "loss": 0.3968, + "step": 5255 + }, + { + "epoch": 1.788741975256154, + "grad_norm": 5.763247278776057, + "learning_rate": 3.6488012668163524e-06, + "loss": 0.4151, + "step": 5260 + }, + { + "epoch": 1.7904425832235025, + "grad_norm": 5.29037745075565, + "learning_rate": 3.646391706455528e-06, + "loss": 0.4132, + "step": 5265 + }, + { + "epoch": 1.7921431911908507, + "grad_norm": 4.138638916673085, + "learning_rate": 3.6439807968348124e-06, + "loss": 0.4132, + "step": 5270 + }, + { + "epoch": 1.793843799158199, + "grad_norm": 6.493935903659294, + "learning_rate": 3.641568540791754e-06, + "loss": 0.4089, + "step": 5275 + }, + { + "epoch": 1.7955444071255475, + "grad_norm": 4.802012353682913, + "learning_rate": 3.639154941165488e-06, + "loss": 0.4227, + "step": 5280 + }, + { + "epoch": 1.7972450150928956, + "grad_norm": 3.4193309281326165, + "learning_rate": 3.6367400007967303e-06, + "loss": 0.4108, + "step": 5285 + }, + { + "epoch": 1.798945623060244, + "grad_norm": 4.888742378693622, + "learning_rate": 3.634323722527775e-06, + "loss": 0.4117, + "step": 5290 + }, + { + "epoch": 1.8006462310275924, + "grad_norm": 3.1983098757845716, + "learning_rate": 3.6319061092024908e-06, + "loss": 0.404, + "step": 5295 + }, + { + "epoch": 1.8023468389949406, + "grad_norm": 3.497239045953602, + "learning_rate": 3.629487163666317e-06, + "loss": 0.3974, + "step": 5300 + }, + { + "epoch": 1.8040474469622891, + "grad_norm": 3.389532929241317, + "learning_rate": 3.6270668887662617e-06, + "loss": 0.4099, + "step": 5305 + }, + { + "epoch": 1.8057480549296372, + "grad_norm": 5.75582582741186, + "learning_rate": 3.6246452873508974e-06, + "loss": 0.4205, + "step": 5310 + }, + { + "epoch": 1.8074486628969857, + "grad_norm": 4.053834889668838, + "learning_rate": 3.6222223622703588e-06, + "loss": 0.4371, + "step": 5315 + }, + { + "epoch": 1.809149270864334, + "grad_norm": 6.526089938179861, + "learning_rate": 3.6197981163763363e-06, + "loss": 0.4115, + "step": 5320 + }, + { + "epoch": 1.8108498788316822, + "grad_norm": 3.8633098332276345, + "learning_rate": 3.617372552522076e-06, + "loss": 0.4198, + "step": 5325 + }, + { + "epoch": 1.8125504867990307, + "grad_norm": 4.490269234498097, + "learning_rate": 3.614945673562376e-06, + "loss": 0.41, + "step": 5330 + }, + { + "epoch": 1.814251094766379, + "grad_norm": 3.729763555379225, + "learning_rate": 3.6125174823535814e-06, + "loss": 0.4261, + "step": 5335 + }, + { + "epoch": 1.8159517027337273, + "grad_norm": 4.209308445244779, + "learning_rate": 3.610087981753582e-06, + "loss": 0.3966, + "step": 5340 + }, + { + "epoch": 1.8176523107010756, + "grad_norm": 3.114516904920482, + "learning_rate": 3.607657174621807e-06, + "loss": 0.4314, + "step": 5345 + }, + { + "epoch": 1.8193529186684239, + "grad_norm": 4.563806015836235, + "learning_rate": 3.605225063819227e-06, + "loss": 0.3873, + "step": 5350 + }, + { + "epoch": 1.8210535266357724, + "grad_norm": 3.4794691991429563, + "learning_rate": 3.602791652208344e-06, + "loss": 0.4083, + "step": 5355 + }, + { + "epoch": 1.8227541346031206, + "grad_norm": 3.7479801450742025, + "learning_rate": 3.6003569426531913e-06, + "loss": 0.4211, + "step": 5360 + }, + { + "epoch": 1.824454742570469, + "grad_norm": 5.836597057493091, + "learning_rate": 3.597920938019332e-06, + "loss": 0.4115, + "step": 5365 + }, + { + "epoch": 1.8261553505378174, + "grad_norm": 4.694732953885266, + "learning_rate": 3.5954836411738497e-06, + "loss": 0.4068, + "step": 5370 + }, + { + "epoch": 1.8278559585051655, + "grad_norm": 3.8101990170781477, + "learning_rate": 3.5930450549853525e-06, + "loss": 0.4094, + "step": 5375 + }, + { + "epoch": 1.829556566472514, + "grad_norm": 3.51236481656127, + "learning_rate": 3.5906051823239646e-06, + "loss": 0.4285, + "step": 5380 + }, + { + "epoch": 1.8312571744398622, + "grad_norm": 3.94658556315043, + "learning_rate": 3.588164026061324e-06, + "loss": 0.4235, + "step": 5385 + }, + { + "epoch": 1.8329577824072105, + "grad_norm": 3.3174850981591684, + "learning_rate": 3.58572158907058e-06, + "loss": 0.41, + "step": 5390 + }, + { + "epoch": 1.834658390374559, + "grad_norm": 3.6492814087849252, + "learning_rate": 3.5832778742263887e-06, + "loss": 0.429, + "step": 5395 + }, + { + "epoch": 1.836358998341907, + "grad_norm": 2.8277702279887063, + "learning_rate": 3.58083288440491e-06, + "loss": 0.405, + "step": 5400 + }, + { + "epoch": 1.8380596063092556, + "grad_norm": 4.03173637344181, + "learning_rate": 3.5783866224838056e-06, + "loss": 0.4175, + "step": 5405 + }, + { + "epoch": 1.8397602142766039, + "grad_norm": 5.057067485565598, + "learning_rate": 3.575939091342233e-06, + "loss": 0.4081, + "step": 5410 + }, + { + "epoch": 1.8414608222439521, + "grad_norm": 5.22123889542372, + "learning_rate": 3.5734902938608464e-06, + "loss": 0.4166, + "step": 5415 + }, + { + "epoch": 1.8431614302113006, + "grad_norm": 4.538960817215644, + "learning_rate": 3.5710402329217853e-06, + "loss": 0.4129, + "step": 5420 + }, + { + "epoch": 1.844862038178649, + "grad_norm": 4.407674164743875, + "learning_rate": 3.568588911408681e-06, + "loss": 0.43, + "step": 5425 + }, + { + "epoch": 1.8465626461459972, + "grad_norm": 5.401174955894541, + "learning_rate": 3.5661363322066457e-06, + "loss": 0.4276, + "step": 5430 + }, + { + "epoch": 1.8482632541133455, + "grad_norm": 5.251158658037828, + "learning_rate": 3.5636824982022733e-06, + "loss": 0.4123, + "step": 5435 + }, + { + "epoch": 1.8499638620806937, + "grad_norm": 3.470885033365259, + "learning_rate": 3.5612274122836347e-06, + "loss": 0.4131, + "step": 5440 + }, + { + "epoch": 1.8516644700480422, + "grad_norm": 2.7229742474641583, + "learning_rate": 3.5587710773402728e-06, + "loss": 0.4181, + "step": 5445 + }, + { + "epoch": 1.8533650780153905, + "grad_norm": 5.8691764648272295, + "learning_rate": 3.556313496263202e-06, + "loss": 0.4139, + "step": 5450 + }, + { + "epoch": 1.8550656859827388, + "grad_norm": 3.825299235777345, + "learning_rate": 3.5538546719449016e-06, + "loss": 0.3916, + "step": 5455 + }, + { + "epoch": 1.8567662939500873, + "grad_norm": 2.5630247348522905, + "learning_rate": 3.551394607279317e-06, + "loss": 0.4, + "step": 5460 + }, + { + "epoch": 1.8584669019174354, + "grad_norm": 4.1369968485018696, + "learning_rate": 3.5489333051618502e-06, + "loss": 0.4556, + "step": 5465 + }, + { + "epoch": 1.8601675098847839, + "grad_norm": 3.8075109811197185, + "learning_rate": 3.546470768489362e-06, + "loss": 0.4064, + "step": 5470 + }, + { + "epoch": 1.8618681178521321, + "grad_norm": 2.6887363622082976, + "learning_rate": 3.5440070001601645e-06, + "loss": 0.4219, + "step": 5475 + }, + { + "epoch": 1.8635687258194804, + "grad_norm": 5.633259612800654, + "learning_rate": 3.5415420030740213e-06, + "loss": 0.4053, + "step": 5480 + }, + { + "epoch": 1.865269333786829, + "grad_norm": 8.192273971343418, + "learning_rate": 3.539075780132141e-06, + "loss": 0.4197, + "step": 5485 + }, + { + "epoch": 1.866969941754177, + "grad_norm": 8.451498019783399, + "learning_rate": 3.5366083342371736e-06, + "loss": 0.3956, + "step": 5490 + }, + { + "epoch": 1.8686705497215255, + "grad_norm": 3.479976888795481, + "learning_rate": 3.534139668293213e-06, + "loss": 0.425, + "step": 5495 + }, + { + "epoch": 1.8703711576888737, + "grad_norm": 5.866557178030037, + "learning_rate": 3.5316697852057837e-06, + "loss": 0.4102, + "step": 5500 + }, + { + "epoch": 1.872071765656222, + "grad_norm": 4.035005484628097, + "learning_rate": 3.5291986878818465e-06, + "loss": 0.4199, + "step": 5505 + }, + { + "epoch": 1.8737723736235705, + "grad_norm": 3.5002876163902474, + "learning_rate": 3.526726379229789e-06, + "loss": 0.3934, + "step": 5510 + }, + { + "epoch": 1.8754729815909188, + "grad_norm": 3.6304680834425684, + "learning_rate": 3.5242528621594258e-06, + "loss": 0.4432, + "step": 5515 + }, + { + "epoch": 1.877173589558267, + "grad_norm": 3.6584152163360937, + "learning_rate": 3.5217781395819933e-06, + "loss": 0.3992, + "step": 5520 + }, + { + "epoch": 1.8788741975256154, + "grad_norm": 4.457327747635903, + "learning_rate": 3.5193022144101474e-06, + "loss": 0.4117, + "step": 5525 + }, + { + "epoch": 1.8805748054929636, + "grad_norm": 4.636453776912117, + "learning_rate": 3.516825089557958e-06, + "loss": 0.3967, + "step": 5530 + }, + { + "epoch": 1.8822754134603121, + "grad_norm": 4.035134315793152, + "learning_rate": 3.5143467679409086e-06, + "loss": 0.4277, + "step": 5535 + }, + { + "epoch": 1.8839760214276604, + "grad_norm": 7.208813835387405, + "learning_rate": 3.5118672524758902e-06, + "loss": 0.4186, + "step": 5540 + }, + { + "epoch": 1.8856766293950087, + "grad_norm": 4.0132329719104645, + "learning_rate": 3.5093865460811986e-06, + "loss": 0.4216, + "step": 5545 + }, + { + "epoch": 1.8873772373623572, + "grad_norm": 2.7614191467947378, + "learning_rate": 3.506904651676532e-06, + "loss": 0.4247, + "step": 5550 + }, + { + "epoch": 1.8890778453297052, + "grad_norm": 3.494289269774637, + "learning_rate": 3.5044215721829877e-06, + "loss": 0.4194, + "step": 5555 + }, + { + "epoch": 1.8907784532970537, + "grad_norm": 5.42233312332927, + "learning_rate": 3.501937310523056e-06, + "loss": 0.3973, + "step": 5560 + }, + { + "epoch": 1.892479061264402, + "grad_norm": 5.442951689643182, + "learning_rate": 3.4994518696206193e-06, + "loss": 0.4116, + "step": 5565 + }, + { + "epoch": 1.8941796692317503, + "grad_norm": 5.737489385443094, + "learning_rate": 3.4969652524009484e-06, + "loss": 0.4183, + "step": 5570 + }, + { + "epoch": 1.8958802771990988, + "grad_norm": 4.014297820507776, + "learning_rate": 3.4944774617906985e-06, + "loss": 0.3938, + "step": 5575 + }, + { + "epoch": 1.8975808851664469, + "grad_norm": 4.66789293356542, + "learning_rate": 3.4919885007179045e-06, + "loss": 0.4294, + "step": 5580 + }, + { + "epoch": 1.8992814931337954, + "grad_norm": 4.700100407150136, + "learning_rate": 3.48949837211198e-06, + "loss": 0.426, + "step": 5585 + }, + { + "epoch": 1.9009821011011436, + "grad_norm": 4.840905102605415, + "learning_rate": 3.4870070789037137e-06, + "loss": 0.3858, + "step": 5590 + }, + { + "epoch": 1.902682709068492, + "grad_norm": 9.12091742025157, + "learning_rate": 3.484514624025263e-06, + "loss": 0.4147, + "step": 5595 + }, + { + "epoch": 1.9043833170358404, + "grad_norm": 4.724736678683829, + "learning_rate": 3.4820210104101537e-06, + "loss": 0.3841, + "step": 5600 + }, + { + "epoch": 1.9060839250031887, + "grad_norm": 5.18009895076808, + "learning_rate": 3.4795262409932755e-06, + "loss": 0.4282, + "step": 5605 + }, + { + "epoch": 1.907784532970537, + "grad_norm": 5.792537607638843, + "learning_rate": 3.4770303187108775e-06, + "loss": 0.4034, + "step": 5610 + }, + { + "epoch": 1.9094851409378852, + "grad_norm": 4.6349091095733295, + "learning_rate": 3.4745332465005673e-06, + "loss": 0.4028, + "step": 5615 + }, + { + "epoch": 1.9111857489052335, + "grad_norm": 6.723455838078338, + "learning_rate": 3.4720350273013037e-06, + "loss": 0.4166, + "step": 5620 + }, + { + "epoch": 1.912886356872582, + "grad_norm": 6.242344377413638, + "learning_rate": 3.469535664053397e-06, + "loss": 0.4031, + "step": 5625 + }, + { + "epoch": 1.9145869648399303, + "grad_norm": 6.84802664068361, + "learning_rate": 3.4670351596985046e-06, + "loss": 0.415, + "step": 5630 + }, + { + "epoch": 1.9162875728072786, + "grad_norm": 7.228260182496643, + "learning_rate": 3.464533517179625e-06, + "loss": 0.4315, + "step": 5635 + }, + { + "epoch": 1.917988180774627, + "grad_norm": 2.96606041432462, + "learning_rate": 3.4620307394410978e-06, + "loss": 0.4223, + "step": 5640 + }, + { + "epoch": 1.9196887887419751, + "grad_norm": 3.7918240411783857, + "learning_rate": 3.459526829428598e-06, + "loss": 0.403, + "step": 5645 + }, + { + "epoch": 1.9213893967093236, + "grad_norm": 3.1192412744558955, + "learning_rate": 3.4570217900891334e-06, + "loss": 0.4153, + "step": 5650 + }, + { + "epoch": 1.923090004676672, + "grad_norm": 9.22746745511448, + "learning_rate": 3.4545156243710416e-06, + "loss": 0.4118, + "step": 5655 + }, + { + "epoch": 1.9247906126440202, + "grad_norm": 4.8796467224589195, + "learning_rate": 3.4520083352239843e-06, + "loss": 0.4113, + "step": 5660 + }, + { + "epoch": 1.9264912206113687, + "grad_norm": 2.9985883082157896, + "learning_rate": 3.449499925598947e-06, + "loss": 0.403, + "step": 5665 + }, + { + "epoch": 1.9281918285787167, + "grad_norm": 3.2382926012581246, + "learning_rate": 3.446990398448233e-06, + "loss": 0.4105, + "step": 5670 + }, + { + "epoch": 1.9298924365460652, + "grad_norm": 4.399404706868584, + "learning_rate": 3.4444797567254618e-06, + "loss": 0.4302, + "step": 5675 + }, + { + "epoch": 1.9315930445134135, + "grad_norm": 3.5161582978925843, + "learning_rate": 3.4419680033855646e-06, + "loss": 0.4182, + "step": 5680 + }, + { + "epoch": 1.9332936524807618, + "grad_norm": 13.298000660642423, + "learning_rate": 3.43945514138478e-06, + "loss": 0.4229, + "step": 5685 + }, + { + "epoch": 1.9349942604481103, + "grad_norm": 6.021996835427646, + "learning_rate": 3.4369411736806518e-06, + "loss": 0.3972, + "step": 5690 + }, + { + "epoch": 1.9366948684154586, + "grad_norm": 3.5836568774514412, + "learning_rate": 3.4344261032320256e-06, + "loss": 0.4128, + "step": 5695 + }, + { + "epoch": 1.9383954763828068, + "grad_norm": 3.4951768750855345, + "learning_rate": 3.431909932999045e-06, + "loss": 0.4233, + "step": 5700 + }, + { + "epoch": 1.9400960843501551, + "grad_norm": 4.848115803542296, + "learning_rate": 3.4293926659431476e-06, + "loss": 0.4363, + "step": 5705 + }, + { + "epoch": 1.9417966923175034, + "grad_norm": 3.8444396087392927, + "learning_rate": 3.4268743050270615e-06, + "loss": 0.4047, + "step": 5710 + }, + { + "epoch": 1.943497300284852, + "grad_norm": 3.2226949571838714, + "learning_rate": 3.4243548532148023e-06, + "loss": 0.4106, + "step": 5715 + }, + { + "epoch": 1.9451979082522002, + "grad_norm": 3.3308747796425955, + "learning_rate": 3.4218343134716714e-06, + "loss": 0.3999, + "step": 5720 + }, + { + "epoch": 1.9468985162195485, + "grad_norm": 4.88703347692355, + "learning_rate": 3.419312688764248e-06, + "loss": 0.4048, + "step": 5725 + }, + { + "epoch": 1.948599124186897, + "grad_norm": 3.9874201659248003, + "learning_rate": 3.41678998206039e-06, + "loss": 0.4385, + "step": 5730 + }, + { + "epoch": 1.950299732154245, + "grad_norm": 6.1419864939233175, + "learning_rate": 3.414266196329228e-06, + "loss": 0.416, + "step": 5735 + }, + { + "epoch": 1.9520003401215935, + "grad_norm": 4.112898935066507, + "learning_rate": 3.411741334541163e-06, + "loss": 0.3927, + "step": 5740 + }, + { + "epoch": 1.9537009480889418, + "grad_norm": 3.953089011926561, + "learning_rate": 3.409215399667863e-06, + "loss": 0.4042, + "step": 5745 + }, + { + "epoch": 1.95540155605629, + "grad_norm": 5.767046903179801, + "learning_rate": 3.4066883946822566e-06, + "loss": 0.4006, + "step": 5750 + }, + { + "epoch": 1.9571021640236386, + "grad_norm": 3.341317702928936, + "learning_rate": 3.404160322558535e-06, + "loss": 0.4227, + "step": 5755 + }, + { + "epoch": 1.9588027719909866, + "grad_norm": 4.07190080441077, + "learning_rate": 3.401631186272143e-06, + "loss": 0.416, + "step": 5760 + }, + { + "epoch": 1.9605033799583351, + "grad_norm": 12.21280293374208, + "learning_rate": 3.39910098879978e-06, + "loss": 0.3751, + "step": 5765 + }, + { + "epoch": 1.9622039879256834, + "grad_norm": 4.172650121953233, + "learning_rate": 3.396569733119392e-06, + "loss": 0.4123, + "step": 5770 + }, + { + "epoch": 1.9639045958930317, + "grad_norm": 3.6288758910428904, + "learning_rate": 3.3940374222101718e-06, + "loss": 0.3984, + "step": 5775 + }, + { + "epoch": 1.9656052038603802, + "grad_norm": 3.3156182811454498, + "learning_rate": 3.391504059052555e-06, + "loss": 0.4042, + "step": 5780 + }, + { + "epoch": 1.9673058118277285, + "grad_norm": 4.9764018707293, + "learning_rate": 3.3889696466282133e-06, + "loss": 0.4323, + "step": 5785 + }, + { + "epoch": 1.9690064197950767, + "grad_norm": 3.0156131427562265, + "learning_rate": 3.3864341879200564e-06, + "loss": 0.4326, + "step": 5790 + }, + { + "epoch": 1.970707027762425, + "grad_norm": 2.8678588097789293, + "learning_rate": 3.3838976859122217e-06, + "loss": 0.3995, + "step": 5795 + }, + { + "epoch": 1.9724076357297733, + "grad_norm": 3.3362617100666356, + "learning_rate": 3.381360143590078e-06, + "loss": 0.3796, + "step": 5800 + }, + { + "epoch": 1.9741082436971218, + "grad_norm": 4.58668537659758, + "learning_rate": 3.3788215639402173e-06, + "loss": 0.4115, + "step": 5805 + }, + { + "epoch": 1.97580885166447, + "grad_norm": 4.345595276003884, + "learning_rate": 3.3762819499504517e-06, + "loss": 0.4085, + "step": 5810 + }, + { + "epoch": 1.9775094596318183, + "grad_norm": 5.3576993167657605, + "learning_rate": 3.3737413046098115e-06, + "loss": 0.4264, + "step": 5815 + }, + { + "epoch": 1.9792100675991668, + "grad_norm": 5.050489795627901, + "learning_rate": 3.371199630908541e-06, + "loss": 0.4042, + "step": 5820 + }, + { + "epoch": 1.980910675566515, + "grad_norm": 5.566547262030448, + "learning_rate": 3.3686569318380935e-06, + "loss": 0.4063, + "step": 5825 + }, + { + "epoch": 1.9826112835338634, + "grad_norm": 3.2549721138244974, + "learning_rate": 3.366113210391131e-06, + "loss": 0.4224, + "step": 5830 + }, + { + "epoch": 1.9843118915012117, + "grad_norm": 3.515436243577237, + "learning_rate": 3.3635684695615178e-06, + "loss": 0.4238, + "step": 5835 + }, + { + "epoch": 1.98601249946856, + "grad_norm": 7.536854238457131, + "learning_rate": 3.3610227123443175e-06, + "loss": 0.3986, + "step": 5840 + }, + { + "epoch": 1.9877131074359085, + "grad_norm": 3.8966337878486166, + "learning_rate": 3.358475941735791e-06, + "loss": 0.3911, + "step": 5845 + }, + { + "epoch": 1.9894137154032565, + "grad_norm": 3.690649105892383, + "learning_rate": 3.355928160733391e-06, + "loss": 0.4384, + "step": 5850 + }, + { + "epoch": 1.991114323370605, + "grad_norm": 4.7328145823776095, + "learning_rate": 3.3533793723357606e-06, + "loss": 0.3964, + "step": 5855 + }, + { + "epoch": 1.9928149313379533, + "grad_norm": 3.7226029893933834, + "learning_rate": 3.3508295795427275e-06, + "loss": 0.4241, + "step": 5860 + }, + { + "epoch": 1.9945155393053016, + "grad_norm": 2.9375452734873773, + "learning_rate": 3.3482787853553013e-06, + "loss": 0.4104, + "step": 5865 + }, + { + "epoch": 1.99621614727265, + "grad_norm": 2.990235322214759, + "learning_rate": 3.3457269927756714e-06, + "loss": 0.3995, + "step": 5870 + }, + { + "epoch": 1.9979167552399983, + "grad_norm": 3.893323342077673, + "learning_rate": 3.3431742048072013e-06, + "loss": 0.4212, + "step": 5875 + }, + { + "epoch": 1.9996173632073466, + "grad_norm": 3.4787084903115204, + "learning_rate": 3.340620424454427e-06, + "loss": 0.4086, + "step": 5880 + }, + { + "epoch": 2.001020364780409, + "grad_norm": 3.2811677378027113, + "learning_rate": 3.338065654723051e-06, + "loss": 0.3351, + "step": 5885 + }, + { + "epoch": 2.0027209727477575, + "grad_norm": 6.913529304657979, + "learning_rate": 3.335509898619942e-06, + "loss": 0.385, + "step": 5890 + }, + { + "epoch": 2.0044215807151056, + "grad_norm": 3.8715857028540532, + "learning_rate": 3.3329531591531276e-06, + "loss": 0.4009, + "step": 5895 + }, + { + "epoch": 2.006122188682454, + "grad_norm": 3.564027863152233, + "learning_rate": 3.330395439331795e-06, + "loss": 0.3685, + "step": 5900 + }, + { + "epoch": 2.007822796649802, + "grad_norm": 6.615848796909299, + "learning_rate": 3.327836742166284e-06, + "loss": 0.4048, + "step": 5905 + }, + { + "epoch": 2.0095234046171506, + "grad_norm": 2.8990099800711495, + "learning_rate": 3.325277070668084e-06, + "loss": 0.3822, + "step": 5910 + }, + { + "epoch": 2.011224012584499, + "grad_norm": 10.672180921903456, + "learning_rate": 3.3227164278498323e-06, + "loss": 0.3806, + "step": 5915 + }, + { + "epoch": 2.012924620551847, + "grad_norm": 6.1773874270484965, + "learning_rate": 3.32015481672531e-06, + "loss": 0.4091, + "step": 5920 + }, + { + "epoch": 2.0146252285191957, + "grad_norm": 6.321820591462762, + "learning_rate": 3.3175922403094356e-06, + "loss": 0.3809, + "step": 5925 + }, + { + "epoch": 2.0163258364865437, + "grad_norm": 3.0039511137847024, + "learning_rate": 3.315028701618267e-06, + "loss": 0.4017, + "step": 5930 + }, + { + "epoch": 2.0180264444538922, + "grad_norm": 4.7799065896337245, + "learning_rate": 3.312464203668991e-06, + "loss": 0.3925, + "step": 5935 + }, + { + "epoch": 2.0197270524212407, + "grad_norm": 3.6456258180111085, + "learning_rate": 3.309898749479926e-06, + "loss": 0.3817, + "step": 5940 + }, + { + "epoch": 2.021427660388589, + "grad_norm": 3.6402333636980315, + "learning_rate": 3.307332342070515e-06, + "loss": 0.4025, + "step": 5945 + }, + { + "epoch": 2.0231282683559373, + "grad_norm": 3.4980694486792325, + "learning_rate": 3.3047649844613227e-06, + "loss": 0.4214, + "step": 5950 + }, + { + "epoch": 2.024828876323286, + "grad_norm": 3.7702137274256247, + "learning_rate": 3.3021966796740322e-06, + "loss": 0.4031, + "step": 5955 + }, + { + "epoch": 2.026529484290634, + "grad_norm": 4.4118924243661, + "learning_rate": 3.2996274307314425e-06, + "loss": 0.3826, + "step": 5960 + }, + { + "epoch": 2.0282300922579823, + "grad_norm": 3.4502156659163647, + "learning_rate": 3.297057240657462e-06, + "loss": 0.381, + "step": 5965 + }, + { + "epoch": 2.0299307002253304, + "grad_norm": 2.7373229326317103, + "learning_rate": 3.294486112477108e-06, + "loss": 0.3835, + "step": 5970 + }, + { + "epoch": 2.031631308192679, + "grad_norm": 6.504240505072941, + "learning_rate": 3.291914049216501e-06, + "loss": 0.4086, + "step": 5975 + }, + { + "epoch": 2.0333319161600274, + "grad_norm": 3.76788660053163, + "learning_rate": 3.289341053902863e-06, + "loss": 0.4276, + "step": 5980 + }, + { + "epoch": 2.0350325241273755, + "grad_norm": 4.846638448314908, + "learning_rate": 3.2867671295645133e-06, + "loss": 0.3647, + "step": 5985 + }, + { + "epoch": 2.036733132094724, + "grad_norm": 3.2953006865029733, + "learning_rate": 3.2841922792308634e-06, + "loss": 0.3907, + "step": 5990 + }, + { + "epoch": 2.038433740062072, + "grad_norm": 5.026985716820102, + "learning_rate": 3.2816165059324152e-06, + "loss": 0.3981, + "step": 5995 + }, + { + "epoch": 2.0401343480294205, + "grad_norm": 3.30096376406445, + "learning_rate": 3.2790398127007574e-06, + "loss": 0.4084, + "step": 6000 + }, + { + "epoch": 2.041834955996769, + "grad_norm": 3.4806017478356446, + "learning_rate": 3.27646220256856e-06, + "loss": 0.4015, + "step": 6005 + }, + { + "epoch": 2.043535563964117, + "grad_norm": 3.1259456878617415, + "learning_rate": 3.273883678569574e-06, + "loss": 0.3957, + "step": 6010 + }, + { + "epoch": 2.0452361719314656, + "grad_norm": 3.761567331812402, + "learning_rate": 3.271304243738625e-06, + "loss": 0.3947, + "step": 6015 + }, + { + "epoch": 2.0469367798988136, + "grad_norm": 3.8267836804540933, + "learning_rate": 3.2687239011116105e-06, + "loss": 0.3958, + "step": 6020 + }, + { + "epoch": 2.048637387866162, + "grad_norm": 3.842535621197304, + "learning_rate": 3.266142653725497e-06, + "loss": 0.4056, + "step": 6025 + }, + { + "epoch": 2.0503379958335106, + "grad_norm": 2.8454844343863566, + "learning_rate": 3.263560504618315e-06, + "loss": 0.3951, + "step": 6030 + }, + { + "epoch": 2.0520386038008587, + "grad_norm": 6.166504416915158, + "learning_rate": 3.2609774568291565e-06, + "loss": 0.3822, + "step": 6035 + }, + { + "epoch": 2.053739211768207, + "grad_norm": 4.756891951256056, + "learning_rate": 3.2583935133981725e-06, + "loss": 0.3898, + "step": 6040 + }, + { + "epoch": 2.0554398197355557, + "grad_norm": 3.6888598585629535, + "learning_rate": 3.2558086773665665e-06, + "loss": 0.4139, + "step": 6045 + }, + { + "epoch": 2.0571404277029037, + "grad_norm": 3.8564068106904985, + "learning_rate": 3.2532229517765932e-06, + "loss": 0.4012, + "step": 6050 + }, + { + "epoch": 2.0588410356702522, + "grad_norm": 6.97753452107306, + "learning_rate": 3.2506363396715553e-06, + "loss": 0.4005, + "step": 6055 + }, + { + "epoch": 2.0605416436376003, + "grad_norm": 3.973785882545404, + "learning_rate": 3.248048844095797e-06, + "loss": 0.3909, + "step": 6060 + }, + { + "epoch": 2.062242251604949, + "grad_norm": 4.201427492027756, + "learning_rate": 3.2454604680947028e-06, + "loss": 0.3915, + "step": 6065 + }, + { + "epoch": 2.0639428595722973, + "grad_norm": 5.743366083731128, + "learning_rate": 3.2428712147146945e-06, + "loss": 0.4036, + "step": 6070 + }, + { + "epoch": 2.0656434675396453, + "grad_norm": 3.6015493587753746, + "learning_rate": 3.2402810870032266e-06, + "loss": 0.3851, + "step": 6075 + }, + { + "epoch": 2.067344075506994, + "grad_norm": 5.106010597540066, + "learning_rate": 3.2376900880087803e-06, + "loss": 0.4087, + "step": 6080 + }, + { + "epoch": 2.069044683474342, + "grad_norm": 6.80410315583813, + "learning_rate": 3.235098220780865e-06, + "loss": 0.3809, + "step": 6085 + }, + { + "epoch": 2.0707452914416904, + "grad_norm": 3.199373194447593, + "learning_rate": 3.2325054883700106e-06, + "loss": 0.3879, + "step": 6090 + }, + { + "epoch": 2.072445899409039, + "grad_norm": 5.716365369394804, + "learning_rate": 3.229911893827765e-06, + "loss": 0.4057, + "step": 6095 + }, + { + "epoch": 2.074146507376387, + "grad_norm": 22.332863912163635, + "learning_rate": 3.227317440206693e-06, + "loss": 0.3954, + "step": 6100 + }, + { + "epoch": 2.0758471153437354, + "grad_norm": 3.950985622322699, + "learning_rate": 3.224722130560367e-06, + "loss": 0.3993, + "step": 6105 + }, + { + "epoch": 2.0775477233110835, + "grad_norm": 4.89561391136477, + "learning_rate": 3.2221259679433693e-06, + "loss": 0.3784, + "step": 6110 + }, + { + "epoch": 2.079248331278432, + "grad_norm": 4.647117117745478, + "learning_rate": 3.219528955411286e-06, + "loss": 0.3839, + "step": 6115 + }, + { + "epoch": 2.0809489392457805, + "grad_norm": 3.601560868706918, + "learning_rate": 3.2169310960207034e-06, + "loss": 0.3793, + "step": 6120 + }, + { + "epoch": 2.0826495472131286, + "grad_norm": 7.657508120264403, + "learning_rate": 3.214332392829203e-06, + "loss": 0.4051, + "step": 6125 + }, + { + "epoch": 2.084350155180477, + "grad_norm": 3.133410726385583, + "learning_rate": 3.211732848895362e-06, + "loss": 0.3815, + "step": 6130 + }, + { + "epoch": 2.0860507631478256, + "grad_norm": 4.2111262336931725, + "learning_rate": 3.209132467278745e-06, + "loss": 0.382, + "step": 6135 + }, + { + "epoch": 2.0877513711151736, + "grad_norm": 3.693712293456679, + "learning_rate": 3.206531251039904e-06, + "loss": 0.3916, + "step": 6140 + }, + { + "epoch": 2.089451979082522, + "grad_norm": 5.942814279257445, + "learning_rate": 3.203929203240371e-06, + "loss": 0.3899, + "step": 6145 + }, + { + "epoch": 2.09115258704987, + "grad_norm": 15.8907229176879, + "learning_rate": 3.201326326942661e-06, + "loss": 0.3798, + "step": 6150 + }, + { + "epoch": 2.0928531950172187, + "grad_norm": 3.0203623611068093, + "learning_rate": 3.1987226252102588e-06, + "loss": 0.3756, + "step": 6155 + }, + { + "epoch": 2.094553802984567, + "grad_norm": 8.523617546410772, + "learning_rate": 3.196118101107624e-06, + "loss": 0.3834, + "step": 6160 + }, + { + "epoch": 2.0962544109519152, + "grad_norm": 6.317721577230588, + "learning_rate": 3.1935127577001845e-06, + "loss": 0.4033, + "step": 6165 + }, + { + "epoch": 2.0979550189192637, + "grad_norm": 2.8884097928072845, + "learning_rate": 3.19090659805433e-06, + "loss": 0.393, + "step": 6170 + }, + { + "epoch": 2.099655626886612, + "grad_norm": 28.933403887416787, + "learning_rate": 3.1882996252374143e-06, + "loss": 0.399, + "step": 6175 + }, + { + "epoch": 2.1013562348539603, + "grad_norm": 7.400556619958389, + "learning_rate": 3.1856918423177446e-06, + "loss": 0.3847, + "step": 6180 + }, + { + "epoch": 2.103056842821309, + "grad_norm": 5.381824996885983, + "learning_rate": 3.1830832523645836e-06, + "loss": 0.3923, + "step": 6185 + }, + { + "epoch": 2.104757450788657, + "grad_norm": 4.148318719654079, + "learning_rate": 3.1804738584481437e-06, + "loss": 0.3928, + "step": 6190 + }, + { + "epoch": 2.1064580587560053, + "grad_norm": 7.3137959695835235, + "learning_rate": 3.1778636636395833e-06, + "loss": 0.4074, + "step": 6195 + }, + { + "epoch": 2.1081586667233534, + "grad_norm": 5.23251922134586, + "learning_rate": 3.1752526710110032e-06, + "loss": 0.3702, + "step": 6200 + }, + { + "epoch": 2.109859274690702, + "grad_norm": 6.960677215599259, + "learning_rate": 3.1726408836354438e-06, + "loss": 0.3798, + "step": 6205 + }, + { + "epoch": 2.1115598826580504, + "grad_norm": 7.278541102535109, + "learning_rate": 3.1700283045868807e-06, + "loss": 0.3932, + "step": 6210 + }, + { + "epoch": 2.1132604906253984, + "grad_norm": 5.0662289021962765, + "learning_rate": 3.167414936940221e-06, + "loss": 0.3938, + "step": 6215 + }, + { + "epoch": 2.114961098592747, + "grad_norm": 4.225044206581991, + "learning_rate": 3.164800783771299e-06, + "loss": 0.3844, + "step": 6220 + }, + { + "epoch": 2.1166617065600954, + "grad_norm": 3.622337780704819, + "learning_rate": 3.1621858481568755e-06, + "loss": 0.3807, + "step": 6225 + }, + { + "epoch": 2.1183623145274435, + "grad_norm": 4.3833292474446495, + "learning_rate": 3.1595701331746313e-06, + "loss": 0.3765, + "step": 6230 + }, + { + "epoch": 2.120062922494792, + "grad_norm": 3.363567775993743, + "learning_rate": 3.156953641903165e-06, + "loss": 0.4002, + "step": 6235 + }, + { + "epoch": 2.12176353046214, + "grad_norm": 4.60752552686727, + "learning_rate": 3.1543363774219877e-06, + "loss": 0.3636, + "step": 6240 + }, + { + "epoch": 2.1234641384294886, + "grad_norm": 28.46386141345009, + "learning_rate": 3.151718342811521e-06, + "loss": 0.405, + "step": 6245 + }, + { + "epoch": 2.125164746396837, + "grad_norm": 6.232217445947908, + "learning_rate": 3.1490995411530936e-06, + "loss": 0.3801, + "step": 6250 + }, + { + "epoch": 2.126865354364185, + "grad_norm": 3.5755057375891424, + "learning_rate": 3.1464799755289367e-06, + "loss": 0.4139, + "step": 6255 + }, + { + "epoch": 2.1285659623315336, + "grad_norm": 3.6657643773760253, + "learning_rate": 3.1438596490221797e-06, + "loss": 0.3807, + "step": 6260 + }, + { + "epoch": 2.1302665702988817, + "grad_norm": 4.206142634849997, + "learning_rate": 3.141238564716848e-06, + "loss": 0.3787, + "step": 6265 + }, + { + "epoch": 2.13196717826623, + "grad_norm": 9.747712005275096, + "learning_rate": 3.1386167256978606e-06, + "loss": 0.3961, + "step": 6270 + }, + { + "epoch": 2.1336677862335787, + "grad_norm": 10.02961085346867, + "learning_rate": 3.135994135051022e-06, + "loss": 0.3995, + "step": 6275 + }, + { + "epoch": 2.1353683942009267, + "grad_norm": 2.920442716069924, + "learning_rate": 3.1333707958630232e-06, + "loss": 0.3877, + "step": 6280 + }, + { + "epoch": 2.1370690021682752, + "grad_norm": 4.088653138808995, + "learning_rate": 3.130746711221436e-06, + "loss": 0.3912, + "step": 6285 + }, + { + "epoch": 2.1387696101356233, + "grad_norm": 2.919705730672031, + "learning_rate": 3.128121884214709e-06, + "loss": 0.3757, + "step": 6290 + }, + { + "epoch": 2.1404702181029718, + "grad_norm": 5.139100889696049, + "learning_rate": 3.1254963179321645e-06, + "loss": 0.4051, + "step": 6295 + }, + { + "epoch": 2.1421708260703203, + "grad_norm": 8.86992930823528, + "learning_rate": 3.1228700154639957e-06, + "loss": 0.4186, + "step": 6300 + }, + { + "epoch": 2.1438714340376683, + "grad_norm": 4.007375688761407, + "learning_rate": 3.1202429799012612e-06, + "loss": 0.3816, + "step": 6305 + }, + { + "epoch": 2.145572042005017, + "grad_norm": 3.187399320395256, + "learning_rate": 3.117615214335884e-06, + "loss": 0.3822, + "step": 6310 + }, + { + "epoch": 2.147272649972365, + "grad_norm": 3.756238102309157, + "learning_rate": 3.1149867218606437e-06, + "loss": 0.3872, + "step": 6315 + }, + { + "epoch": 2.1489732579397134, + "grad_norm": 3.6426880756085707, + "learning_rate": 3.1123575055691786e-06, + "loss": 0.3747, + "step": 6320 + }, + { + "epoch": 2.150673865907062, + "grad_norm": 2.9304753447302168, + "learning_rate": 3.1097275685559764e-06, + "loss": 0.403, + "step": 6325 + }, + { + "epoch": 2.15237447387441, + "grad_norm": 2.8120536872009683, + "learning_rate": 3.1070969139163744e-06, + "loss": 0.3761, + "step": 6330 + }, + { + "epoch": 2.1540750818417584, + "grad_norm": 3.217746393107104, + "learning_rate": 3.1044655447465537e-06, + "loss": 0.3917, + "step": 6335 + }, + { + "epoch": 2.155775689809107, + "grad_norm": 8.022088518257592, + "learning_rate": 3.1018334641435365e-06, + "loss": 0.3975, + "step": 6340 + }, + { + "epoch": 2.157476297776455, + "grad_norm": 9.534585463758319, + "learning_rate": 3.099200675205184e-06, + "loss": 0.3926, + "step": 6345 + }, + { + "epoch": 2.1591769057438035, + "grad_norm": 3.0045397628551926, + "learning_rate": 3.096567181030188e-06, + "loss": 0.3936, + "step": 6350 + }, + { + "epoch": 2.1608775137111516, + "grad_norm": 3.9190507509718318, + "learning_rate": 3.0939329847180725e-06, + "loss": 0.3816, + "step": 6355 + }, + { + "epoch": 2.1625781216785, + "grad_norm": 4.673292029453938, + "learning_rate": 3.0912980893691883e-06, + "loss": 0.3907, + "step": 6360 + }, + { + "epoch": 2.1642787296458486, + "grad_norm": 3.315784859499087, + "learning_rate": 3.088662498084708e-06, + "loss": 0.3989, + "step": 6365 + }, + { + "epoch": 2.1659793376131966, + "grad_norm": 6.957493367825952, + "learning_rate": 3.086026213966622e-06, + "loss": 0.3893, + "step": 6370 + }, + { + "epoch": 2.167679945580545, + "grad_norm": 4.573861308675167, + "learning_rate": 3.083389240117739e-06, + "loss": 0.3838, + "step": 6375 + }, + { + "epoch": 2.169380553547893, + "grad_norm": 5.145518565607208, + "learning_rate": 3.0807515796416766e-06, + "loss": 0.3884, + "step": 6380 + }, + { + "epoch": 2.1710811615152417, + "grad_norm": 3.8807568929567764, + "learning_rate": 3.0781132356428633e-06, + "loss": 0.3845, + "step": 6385 + }, + { + "epoch": 2.17278176948259, + "grad_norm": 5.3203271906906915, + "learning_rate": 3.0754742112265294e-06, + "loss": 0.4042, + "step": 6390 + }, + { + "epoch": 2.174482377449938, + "grad_norm": 3.3306654186981595, + "learning_rate": 3.0728345094987078e-06, + "loss": 0.3845, + "step": 6395 + }, + { + "epoch": 2.1761829854172867, + "grad_norm": 3.1182587811158404, + "learning_rate": 3.070194133566229e-06, + "loss": 0.3963, + "step": 6400 + }, + { + "epoch": 2.177883593384635, + "grad_norm": 2.8494439811287555, + "learning_rate": 3.0675530865367143e-06, + "loss": 0.3861, + "step": 6405 + }, + { + "epoch": 2.1795842013519833, + "grad_norm": 25.04637976539768, + "learning_rate": 3.0649113715185772e-06, + "loss": 0.3752, + "step": 6410 + }, + { + "epoch": 2.1812848093193318, + "grad_norm": 6.638253290510534, + "learning_rate": 3.0622689916210185e-06, + "loss": 0.4129, + "step": 6415 + }, + { + "epoch": 2.18298541728668, + "grad_norm": 4.096836227802035, + "learning_rate": 3.0596259499540178e-06, + "loss": 0.4029, + "step": 6420 + }, + { + "epoch": 2.1846860252540283, + "grad_norm": 5.949966887843709, + "learning_rate": 3.056982249628337e-06, + "loss": 0.3671, + "step": 6425 + }, + { + "epoch": 2.186386633221377, + "grad_norm": 5.186398482392616, + "learning_rate": 3.0543378937555113e-06, + "loss": 0.3921, + "step": 6430 + }, + { + "epoch": 2.188087241188725, + "grad_norm": 4.8792204209101815, + "learning_rate": 3.0516928854478478e-06, + "loss": 0.4044, + "step": 6435 + }, + { + "epoch": 2.1897878491560734, + "grad_norm": 3.003562599707504, + "learning_rate": 3.0490472278184226e-06, + "loss": 0.3849, + "step": 6440 + }, + { + "epoch": 2.1914884571234214, + "grad_norm": 4.526727944224896, + "learning_rate": 3.0464009239810745e-06, + "loss": 0.3804, + "step": 6445 + }, + { + "epoch": 2.19318906509077, + "grad_norm": 28.20529948007441, + "learning_rate": 3.0437539770504038e-06, + "loss": 0.4042, + "step": 6450 + }, + { + "epoch": 2.1948896730581184, + "grad_norm": 4.058541629984453, + "learning_rate": 3.041106390141767e-06, + "loss": 0.3709, + "step": 6455 + }, + { + "epoch": 2.1965902810254665, + "grad_norm": 3.1362985280636346, + "learning_rate": 3.0384581663712747e-06, + "loss": 0.3841, + "step": 6460 + }, + { + "epoch": 2.198290888992815, + "grad_norm": 4.053826773580988, + "learning_rate": 3.0358093088557867e-06, + "loss": 0.3952, + "step": 6465 + }, + { + "epoch": 2.199991496960163, + "grad_norm": 6.578596836430825, + "learning_rate": 3.0331598207129078e-06, + "loss": 0.3776, + "step": 6470 + }, + { + "epoch": 2.2016921049275116, + "grad_norm": 3.073007554583588, + "learning_rate": 3.0305097050609868e-06, + "loss": 0.3831, + "step": 6475 + }, + { + "epoch": 2.20339271289486, + "grad_norm": 10.660662387188902, + "learning_rate": 3.0278589650191086e-06, + "loss": 0.4201, + "step": 6480 + }, + { + "epoch": 2.205093320862208, + "grad_norm": 3.634437435981205, + "learning_rate": 3.025207603707096e-06, + "loss": 0.4143, + "step": 6485 + }, + { + "epoch": 2.2067939288295566, + "grad_norm": 2.3541378661435775, + "learning_rate": 3.022555624245501e-06, + "loss": 0.3784, + "step": 6490 + }, + { + "epoch": 2.2084945367969047, + "grad_norm": 3.713354619796085, + "learning_rate": 3.019903029755604e-06, + "loss": 0.3941, + "step": 6495 + }, + { + "epoch": 2.210195144764253, + "grad_norm": 3.4344135334816386, + "learning_rate": 3.0172498233594085e-06, + "loss": 0.3905, + "step": 6500 + }, + { + "epoch": 2.2118957527316017, + "grad_norm": 5.643221608685602, + "learning_rate": 3.014596008179638e-06, + "loss": 0.3904, + "step": 6505 + }, + { + "epoch": 2.2135963606989497, + "grad_norm": 3.2816927802420754, + "learning_rate": 3.011941587339734e-06, + "loss": 0.3775, + "step": 6510 + }, + { + "epoch": 2.215296968666298, + "grad_norm": 6.472534170411608, + "learning_rate": 3.0092865639638496e-06, + "loss": 0.3884, + "step": 6515 + }, + { + "epoch": 2.2169975766336467, + "grad_norm": 5.360474824776912, + "learning_rate": 3.006630941176847e-06, + "loss": 0.3934, + "step": 6520 + }, + { + "epoch": 2.2186981846009948, + "grad_norm": 4.213214855907222, + "learning_rate": 3.0039747221042947e-06, + "loss": 0.4072, + "step": 6525 + }, + { + "epoch": 2.2203987925683433, + "grad_norm": 4.0313901632910305, + "learning_rate": 3.0013179098724626e-06, + "loss": 0.4198, + "step": 6530 + }, + { + "epoch": 2.2220994005356913, + "grad_norm": 6.320108057489615, + "learning_rate": 2.9986605076083185e-06, + "loss": 0.3846, + "step": 6535 + }, + { + "epoch": 2.22380000850304, + "grad_norm": 3.3568667388543942, + "learning_rate": 2.9960025184395248e-06, + "loss": 0.3887, + "step": 6540 + }, + { + "epoch": 2.2255006164703883, + "grad_norm": 6.005606524940383, + "learning_rate": 2.9933439454944346e-06, + "loss": 0.3782, + "step": 6545 + }, + { + "epoch": 2.2272012244377364, + "grad_norm": 5.17729596720581, + "learning_rate": 2.990684791902089e-06, + "loss": 0.3946, + "step": 6550 + }, + { + "epoch": 2.228901832405085, + "grad_norm": 2.794064491309928, + "learning_rate": 2.9880250607922107e-06, + "loss": 0.3722, + "step": 6555 + }, + { + "epoch": 2.230602440372433, + "grad_norm": 7.343356762429561, + "learning_rate": 2.9853647552952037e-06, + "loss": 0.3866, + "step": 6560 + }, + { + "epoch": 2.2323030483397814, + "grad_norm": 3.7349707455890697, + "learning_rate": 2.982703878542147e-06, + "loss": 0.3865, + "step": 6565 + }, + { + "epoch": 2.23400365630713, + "grad_norm": 7.457076904709625, + "learning_rate": 2.9800424336647933e-06, + "loss": 0.4007, + "step": 6570 + }, + { + "epoch": 2.235704264274478, + "grad_norm": 3.2239996372192397, + "learning_rate": 2.9773804237955616e-06, + "loss": 0.3824, + "step": 6575 + }, + { + "epoch": 2.2374048722418265, + "grad_norm": 2.9883472148059393, + "learning_rate": 2.974717852067539e-06, + "loss": 0.3784, + "step": 6580 + }, + { + "epoch": 2.239105480209175, + "grad_norm": 3.6203687143679373, + "learning_rate": 2.9720547216144714e-06, + "loss": 0.3922, + "step": 6585 + }, + { + "epoch": 2.240806088176523, + "grad_norm": 4.057120091702925, + "learning_rate": 2.9693910355707622e-06, + "loss": 0.4069, + "step": 6590 + }, + { + "epoch": 2.2425066961438715, + "grad_norm": 2.8483307786069685, + "learning_rate": 2.9667267970714714e-06, + "loss": 0.4098, + "step": 6595 + }, + { + "epoch": 2.2442073041112196, + "grad_norm": 3.3810780108085052, + "learning_rate": 2.9640620092523064e-06, + "loss": 0.4086, + "step": 6600 + }, + { + "epoch": 2.245907912078568, + "grad_norm": 4.052869027985857, + "learning_rate": 2.9613966752496215e-06, + "loss": 0.3857, + "step": 6605 + }, + { + "epoch": 2.2476085200459166, + "grad_norm": 4.073745702007441, + "learning_rate": 2.958730798200416e-06, + "loss": 0.3902, + "step": 6610 + }, + { + "epoch": 2.2493091280132647, + "grad_norm": 9.767610746705577, + "learning_rate": 2.9560643812423258e-06, + "loss": 0.3944, + "step": 6615 + }, + { + "epoch": 2.251009735980613, + "grad_norm": 3.8926434508545413, + "learning_rate": 2.953397427513624e-06, + "loss": 0.399, + "step": 6620 + }, + { + "epoch": 2.252710343947961, + "grad_norm": 4.857648522720984, + "learning_rate": 2.950729940153215e-06, + "loss": 0.3699, + "step": 6625 + }, + { + "epoch": 2.2544109519153097, + "grad_norm": 4.578974296469672, + "learning_rate": 2.9480619223006297e-06, + "loss": 0.4074, + "step": 6630 + }, + { + "epoch": 2.256111559882658, + "grad_norm": 2.9057422570205422, + "learning_rate": 2.9453933770960254e-06, + "loss": 0.4073, + "step": 6635 + }, + { + "epoch": 2.2578121678500063, + "grad_norm": 8.47568626328369, + "learning_rate": 2.9427243076801797e-06, + "loss": 0.3659, + "step": 6640 + }, + { + "epoch": 2.2595127758173548, + "grad_norm": 3.2068623693933263, + "learning_rate": 2.9400547171944864e-06, + "loss": 0.3765, + "step": 6645 + }, + { + "epoch": 2.2612133837847033, + "grad_norm": 3.742057678575811, + "learning_rate": 2.937384608780953e-06, + "loss": 0.3909, + "step": 6650 + }, + { + "epoch": 2.2629139917520513, + "grad_norm": 2.860519356877674, + "learning_rate": 2.9347139855821978e-06, + "loss": 0.3816, + "step": 6655 + }, + { + "epoch": 2.2646145997194, + "grad_norm": 2.580593782356621, + "learning_rate": 2.932042850741442e-06, + "loss": 0.3787, + "step": 6660 + }, + { + "epoch": 2.266315207686748, + "grad_norm": 4.783146635286192, + "learning_rate": 2.929371207402511e-06, + "loss": 0.3949, + "step": 6665 + }, + { + "epoch": 2.2680158156540964, + "grad_norm": 13.918648907622003, + "learning_rate": 2.9266990587098297e-06, + "loss": 0.384, + "step": 6670 + }, + { + "epoch": 2.2697164236214444, + "grad_norm": 3.798237473233975, + "learning_rate": 2.9240264078084163e-06, + "loss": 0.3929, + "step": 6675 + }, + { + "epoch": 2.271417031588793, + "grad_norm": 3.673668146030382, + "learning_rate": 2.9213532578438797e-06, + "loss": 0.3897, + "step": 6680 + }, + { + "epoch": 2.2731176395561414, + "grad_norm": 4.35686860967354, + "learning_rate": 2.9186796119624166e-06, + "loss": 0.3788, + "step": 6685 + }, + { + "epoch": 2.2748182475234895, + "grad_norm": 3.013009008419703, + "learning_rate": 2.9160054733108085e-06, + "loss": 0.375, + "step": 6690 + }, + { + "epoch": 2.276518855490838, + "grad_norm": 4.732871979512938, + "learning_rate": 2.913330845036415e-06, + "loss": 0.3877, + "step": 6695 + }, + { + "epoch": 2.2782194634581865, + "grad_norm": 2.8601369197988844, + "learning_rate": 2.9106557302871735e-06, + "loss": 0.3728, + "step": 6700 + }, + { + "epoch": 2.2799200714255345, + "grad_norm": 3.4292070952688776, + "learning_rate": 2.9079801322115938e-06, + "loss": 0.4044, + "step": 6705 + }, + { + "epoch": 2.281620679392883, + "grad_norm": 7.024904512911684, + "learning_rate": 2.905304053958753e-06, + "loss": 0.3762, + "step": 6710 + }, + { + "epoch": 2.283321287360231, + "grad_norm": 3.25399253417125, + "learning_rate": 2.902627498678295e-06, + "loss": 0.3875, + "step": 6715 + }, + { + "epoch": 2.2850218953275796, + "grad_norm": 30.94644471801256, + "learning_rate": 2.8999504695204246e-06, + "loss": 0.3727, + "step": 6720 + }, + { + "epoch": 2.286722503294928, + "grad_norm": 5.520919390543611, + "learning_rate": 2.8972729696359035e-06, + "loss": 0.3785, + "step": 6725 + }, + { + "epoch": 2.288423111262276, + "grad_norm": 4.206500382437017, + "learning_rate": 2.8945950021760504e-06, + "loss": 0.4015, + "step": 6730 + }, + { + "epoch": 2.2901237192296247, + "grad_norm": 3.5001389066297905, + "learning_rate": 2.891916570292731e-06, + "loss": 0.3736, + "step": 6735 + }, + { + "epoch": 2.2918243271969727, + "grad_norm": 4.046070751027091, + "learning_rate": 2.8892376771383586e-06, + "loss": 0.3694, + "step": 6740 + }, + { + "epoch": 2.293524935164321, + "grad_norm": 6.65064340381129, + "learning_rate": 2.8865583258658895e-06, + "loss": 0.3982, + "step": 6745 + }, + { + "epoch": 2.2952255431316697, + "grad_norm": 2.9008563706204176, + "learning_rate": 2.8838785196288205e-06, + "loss": 0.4032, + "step": 6750 + }, + { + "epoch": 2.2969261510990178, + "grad_norm": 3.406796171684056, + "learning_rate": 2.881198261581182e-06, + "loss": 0.3912, + "step": 6755 + }, + { + "epoch": 2.2986267590663663, + "grad_norm": 5.817153214684034, + "learning_rate": 2.878517554877536e-06, + "loss": 0.3682, + "step": 6760 + }, + { + "epoch": 2.3003273670337148, + "grad_norm": 5.904612058981914, + "learning_rate": 2.8758364026729742e-06, + "loss": 0.3938, + "step": 6765 + }, + { + "epoch": 2.302027975001063, + "grad_norm": 4.497725265776514, + "learning_rate": 2.8731548081231114e-06, + "loss": 0.4018, + "step": 6770 + }, + { + "epoch": 2.3037285829684113, + "grad_norm": 3.0306298954449185, + "learning_rate": 2.870472774384084e-06, + "loss": 0.3864, + "step": 6775 + }, + { + "epoch": 2.3054291909357594, + "grad_norm": 3.957901406872624, + "learning_rate": 2.867790304612545e-06, + "loss": 0.3897, + "step": 6780 + }, + { + "epoch": 2.307129798903108, + "grad_norm": 4.19134334326208, + "learning_rate": 2.8651074019656594e-06, + "loss": 0.3919, + "step": 6785 + }, + { + "epoch": 2.3088304068704564, + "grad_norm": 4.062622192145356, + "learning_rate": 2.862424069601103e-06, + "loss": 0.4088, + "step": 6790 + }, + { + "epoch": 2.3105310148378044, + "grad_norm": 7.4185656087697405, + "learning_rate": 2.859740310677058e-06, + "loss": 0.3816, + "step": 6795 + }, + { + "epoch": 2.312231622805153, + "grad_norm": 4.226402215845138, + "learning_rate": 2.8570561283522063e-06, + "loss": 0.3814, + "step": 6800 + }, + { + "epoch": 2.313932230772501, + "grad_norm": 5.2505222772393525, + "learning_rate": 2.8543715257857294e-06, + "loss": 0.3777, + "step": 6805 + }, + { + "epoch": 2.3156328387398495, + "grad_norm": 6.356064017229173, + "learning_rate": 2.851686506137305e-06, + "loss": 0.3712, + "step": 6810 + }, + { + "epoch": 2.317333446707198, + "grad_norm": 3.5856118910345103, + "learning_rate": 2.849001072567099e-06, + "loss": 0.3847, + "step": 6815 + }, + { + "epoch": 2.319034054674546, + "grad_norm": 4.549197226747274, + "learning_rate": 2.846315228235766e-06, + "loss": 0.4028, + "step": 6820 + }, + { + "epoch": 2.3207346626418945, + "grad_norm": 3.485550956628789, + "learning_rate": 2.8436289763044434e-06, + "loss": 0.3679, + "step": 6825 + }, + { + "epoch": 2.322435270609243, + "grad_norm": 3.378212405100649, + "learning_rate": 2.8409423199347484e-06, + "loss": 0.4032, + "step": 6830 + }, + { + "epoch": 2.324135878576591, + "grad_norm": 3.121843876887772, + "learning_rate": 2.8382552622887753e-06, + "loss": 0.3889, + "step": 6835 + }, + { + "epoch": 2.3258364865439396, + "grad_norm": 7.439721804395752, + "learning_rate": 2.8355678065290893e-06, + "loss": 0.3964, + "step": 6840 + }, + { + "epoch": 2.3275370945112877, + "grad_norm": 3.5065346203448944, + "learning_rate": 2.8328799558187237e-06, + "loss": 0.398, + "step": 6845 + }, + { + "epoch": 2.329237702478636, + "grad_norm": 4.170041707692432, + "learning_rate": 2.83019171332118e-06, + "loss": 0.377, + "step": 6850 + }, + { + "epoch": 2.330938310445984, + "grad_norm": 5.121636763633387, + "learning_rate": 2.8275030822004165e-06, + "loss": 0.3692, + "step": 6855 + }, + { + "epoch": 2.3326389184133327, + "grad_norm": 5.461538166371756, + "learning_rate": 2.824814065620851e-06, + "loss": 0.413, + "step": 6860 + }, + { + "epoch": 2.334339526380681, + "grad_norm": 3.3821381664776826, + "learning_rate": 2.822124666747356e-06, + "loss": 0.3692, + "step": 6865 + }, + { + "epoch": 2.3360401343480293, + "grad_norm": 3.14786263283993, + "learning_rate": 2.8194348887452518e-06, + "loss": 0.397, + "step": 6870 + }, + { + "epoch": 2.3377407423153778, + "grad_norm": 3.8455746251884926, + "learning_rate": 2.8167447347803057e-06, + "loss": 0.3823, + "step": 6875 + }, + { + "epoch": 2.3394413502827263, + "grad_norm": 3.294819905660892, + "learning_rate": 2.814054208018728e-06, + "loss": 0.3888, + "step": 6880 + }, + { + "epoch": 2.3411419582500743, + "grad_norm": 4.042913409326378, + "learning_rate": 2.811363311627168e-06, + "loss": 0.3892, + "step": 6885 + }, + { + "epoch": 2.342842566217423, + "grad_norm": 5.614812550089777, + "learning_rate": 2.808672048772709e-06, + "loss": 0.3675, + "step": 6890 + }, + { + "epoch": 2.344543174184771, + "grad_norm": 9.879426887704412, + "learning_rate": 2.8059804226228655e-06, + "loss": 0.4198, + "step": 6895 + }, + { + "epoch": 2.3462437821521194, + "grad_norm": 2.4671869965760216, + "learning_rate": 2.803288436345581e-06, + "loss": 0.3663, + "step": 6900 + }, + { + "epoch": 2.347944390119468, + "grad_norm": 3.8103039255669935, + "learning_rate": 2.8005960931092207e-06, + "loss": 0.3709, + "step": 6905 + }, + { + "epoch": 2.349644998086816, + "grad_norm": 3.0719064184061162, + "learning_rate": 2.7979033960825734e-06, + "loss": 0.3774, + "step": 6910 + }, + { + "epoch": 2.3513456060541644, + "grad_norm": 5.930297960854046, + "learning_rate": 2.7952103484348407e-06, + "loss": 0.37, + "step": 6915 + }, + { + "epoch": 2.3530462140215125, + "grad_norm": 4.240313783774013, + "learning_rate": 2.792516953335639e-06, + "loss": 0.3748, + "step": 6920 + }, + { + "epoch": 2.354746821988861, + "grad_norm": 5.371956477197449, + "learning_rate": 2.7898232139549917e-06, + "loss": 0.3681, + "step": 6925 + }, + { + "epoch": 2.3564474299562095, + "grad_norm": 4.448795807879105, + "learning_rate": 2.7871291334633305e-06, + "loss": 0.378, + "step": 6930 + }, + { + "epoch": 2.3581480379235575, + "grad_norm": 2.398314300258431, + "learning_rate": 2.784434715031486e-06, + "loss": 0.378, + "step": 6935 + }, + { + "epoch": 2.359848645890906, + "grad_norm": 4.053282477849935, + "learning_rate": 2.781739961830687e-06, + "loss": 0.3919, + "step": 6940 + }, + { + "epoch": 2.3615492538582545, + "grad_norm": 3.434192241461476, + "learning_rate": 2.779044877032556e-06, + "loss": 0.3864, + "step": 6945 + }, + { + "epoch": 2.3632498618256026, + "grad_norm": 3.5127030227379796, + "learning_rate": 2.7763494638091074e-06, + "loss": 0.3837, + "step": 6950 + }, + { + "epoch": 2.364950469792951, + "grad_norm": 12.549471937745547, + "learning_rate": 2.773653725332741e-06, + "loss": 0.4015, + "step": 6955 + }, + { + "epoch": 2.366651077760299, + "grad_norm": 4.330594942310785, + "learning_rate": 2.770957664776239e-06, + "loss": 0.3928, + "step": 6960 + }, + { + "epoch": 2.3683516857276476, + "grad_norm": 2.6158581072547653, + "learning_rate": 2.7682612853127634e-06, + "loss": 0.3895, + "step": 6965 + }, + { + "epoch": 2.370052293694996, + "grad_norm": 4.641906618466447, + "learning_rate": 2.7655645901158516e-06, + "loss": 0.3757, + "step": 6970 + }, + { + "epoch": 2.371752901662344, + "grad_norm": 10.877258540651997, + "learning_rate": 2.7628675823594132e-06, + "loss": 0.3811, + "step": 6975 + }, + { + "epoch": 2.3734535096296927, + "grad_norm": 3.0562321854812358, + "learning_rate": 2.7601702652177225e-06, + "loss": 0.3778, + "step": 6980 + }, + { + "epoch": 2.3751541175970408, + "grad_norm": 11.777994691598085, + "learning_rate": 2.7574726418654225e-06, + "loss": 0.4107, + "step": 6985 + }, + { + "epoch": 2.3768547255643893, + "grad_norm": 3.2157204196506908, + "learning_rate": 2.7547747154775133e-06, + "loss": 0.3915, + "step": 6990 + }, + { + "epoch": 2.3785553335317378, + "grad_norm": 4.373250719021987, + "learning_rate": 2.752076489229353e-06, + "loss": 0.3949, + "step": 6995 + }, + { + "epoch": 2.380255941499086, + "grad_norm": 4.667751747557542, + "learning_rate": 2.749377966296652e-06, + "loss": 0.3898, + "step": 7000 + }, + { + "epoch": 2.3819565494664343, + "grad_norm": 2.70105224071973, + "learning_rate": 2.74667914985547e-06, + "loss": 0.3866, + "step": 7005 + }, + { + "epoch": 2.383657157433783, + "grad_norm": 3.0249429323232353, + "learning_rate": 2.743980043082214e-06, + "loss": 0.369, + "step": 7010 + }, + { + "epoch": 2.385357765401131, + "grad_norm": 3.4397168493562216, + "learning_rate": 2.741280649153629e-06, + "loss": 0.396, + "step": 7015 + }, + { + "epoch": 2.3870583733684794, + "grad_norm": 5.2931519021161515, + "learning_rate": 2.738580971246801e-06, + "loss": 0.407, + "step": 7020 + }, + { + "epoch": 2.3887589813358274, + "grad_norm": 3.093863474694001, + "learning_rate": 2.735881012539149e-06, + "loss": 0.3708, + "step": 7025 + }, + { + "epoch": 2.390459589303176, + "grad_norm": 2.7433960769201233, + "learning_rate": 2.7331807762084236e-06, + "loss": 0.3787, + "step": 7030 + }, + { + "epoch": 2.392160197270524, + "grad_norm": 4.579031308987416, + "learning_rate": 2.7304802654327007e-06, + "loss": 0.3801, + "step": 7035 + }, + { + "epoch": 2.3938608052378725, + "grad_norm": 7.896593870725533, + "learning_rate": 2.727779483390379e-06, + "loss": 0.3988, + "step": 7040 + }, + { + "epoch": 2.395561413205221, + "grad_norm": 2.920203355513295, + "learning_rate": 2.7250784332601793e-06, + "loss": 0.3777, + "step": 7045 + }, + { + "epoch": 2.397262021172569, + "grad_norm": 3.2350813759464345, + "learning_rate": 2.722377118221135e-06, + "loss": 0.3658, + "step": 7050 + }, + { + "epoch": 2.3989626291399175, + "grad_norm": 6.238155764872062, + "learning_rate": 2.719675541452592e-06, + "loss": 0.4156, + "step": 7055 + }, + { + "epoch": 2.400663237107266, + "grad_norm": 2.976892920763275, + "learning_rate": 2.7169737061342044e-06, + "loss": 0.3966, + "step": 7060 + }, + { + "epoch": 2.402363845074614, + "grad_norm": 3.093711932280492, + "learning_rate": 2.7142716154459307e-06, + "loss": 0.3618, + "step": 7065 + }, + { + "epoch": 2.4040644530419626, + "grad_norm": 3.4625004458174695, + "learning_rate": 2.7115692725680304e-06, + "loss": 0.376, + "step": 7070 + }, + { + "epoch": 2.4057650610093106, + "grad_norm": 4.281537573703623, + "learning_rate": 2.708866680681059e-06, + "loss": 0.3883, + "step": 7075 + }, + { + "epoch": 2.407465668976659, + "grad_norm": 3.716958213713566, + "learning_rate": 2.7061638429658653e-06, + "loss": 0.3723, + "step": 7080 + }, + { + "epoch": 2.4091662769440076, + "grad_norm": 3.6006577849348447, + "learning_rate": 2.703460762603588e-06, + "loss": 0.3617, + "step": 7085 + }, + { + "epoch": 2.4108668849113557, + "grad_norm": 24.360757548436837, + "learning_rate": 2.700757442775651e-06, + "loss": 0.3924, + "step": 7090 + }, + { + "epoch": 2.412567492878704, + "grad_norm": 2.888222589544978, + "learning_rate": 2.6980538866637594e-06, + "loss": 0.3937, + "step": 7095 + }, + { + "epoch": 2.4142681008460523, + "grad_norm": 2.921687785117801, + "learning_rate": 2.695350097449897e-06, + "loss": 0.3888, + "step": 7100 + }, + { + "epoch": 2.4159687088134008, + "grad_norm": 4.1461127523046, + "learning_rate": 2.6926460783163223e-06, + "loss": 0.3968, + "step": 7105 + }, + { + "epoch": 2.4176693167807493, + "grad_norm": 4.712695647518259, + "learning_rate": 2.6899418324455643e-06, + "loss": 0.389, + "step": 7110 + }, + { + "epoch": 2.4193699247480973, + "grad_norm": 3.115151541680386, + "learning_rate": 2.6872373630204186e-06, + "loss": 0.3727, + "step": 7115 + }, + { + "epoch": 2.421070532715446, + "grad_norm": 3.157897674566553, + "learning_rate": 2.684532673223943e-06, + "loss": 0.3753, + "step": 7120 + }, + { + "epoch": 2.4227711406827943, + "grad_norm": 3.154771734393668, + "learning_rate": 2.6818277662394567e-06, + "loss": 0.3981, + "step": 7125 + }, + { + "epoch": 2.4244717486501424, + "grad_norm": 2.9277590011213386, + "learning_rate": 2.6791226452505326e-06, + "loss": 0.3768, + "step": 7130 + }, + { + "epoch": 2.426172356617491, + "grad_norm": 3.4408684506274283, + "learning_rate": 2.676417313440997e-06, + "loss": 0.3779, + "step": 7135 + }, + { + "epoch": 2.427872964584839, + "grad_norm": 3.7907855368258234, + "learning_rate": 2.673711773994923e-06, + "loss": 0.3543, + "step": 7140 + }, + { + "epoch": 2.4295735725521874, + "grad_norm": 2.7577432226511003, + "learning_rate": 2.671006030096629e-06, + "loss": 0.3591, + "step": 7145 + }, + { + "epoch": 2.431274180519536, + "grad_norm": 3.70322966299185, + "learning_rate": 2.668300084930674e-06, + "loss": 0.3771, + "step": 7150 + }, + { + "epoch": 2.432974788486884, + "grad_norm": 3.5439886993149985, + "learning_rate": 2.6655939416818534e-06, + "loss": 0.3956, + "step": 7155 + }, + { + "epoch": 2.4346753964542325, + "grad_norm": 3.1874622661076577, + "learning_rate": 2.6628876035351948e-06, + "loss": 0.3989, + "step": 7160 + }, + { + "epoch": 2.4363760044215805, + "grad_norm": 5.7706010189966985, + "learning_rate": 2.660181073675958e-06, + "loss": 0.3575, + "step": 7165 + }, + { + "epoch": 2.438076612388929, + "grad_norm": 3.431719425794485, + "learning_rate": 2.6574743552896266e-06, + "loss": 0.3687, + "step": 7170 + }, + { + "epoch": 2.4397772203562775, + "grad_norm": 15.30462636095367, + "learning_rate": 2.6547674515619053e-06, + "loss": 0.368, + "step": 7175 + }, + { + "epoch": 2.4414778283236256, + "grad_norm": 3.7179034277483787, + "learning_rate": 2.6520603656787187e-06, + "loss": 0.3843, + "step": 7180 + }, + { + "epoch": 2.443178436290974, + "grad_norm": 3.49585049531343, + "learning_rate": 2.6493531008262054e-06, + "loss": 0.3933, + "step": 7185 + }, + { + "epoch": 2.4448790442583226, + "grad_norm": 5.402626100935758, + "learning_rate": 2.6466456601907127e-06, + "loss": 0.3975, + "step": 7190 + }, + { + "epoch": 2.4465796522256706, + "grad_norm": 3.944811421987731, + "learning_rate": 2.643938046958797e-06, + "loss": 0.3778, + "step": 7195 + }, + { + "epoch": 2.448280260193019, + "grad_norm": 4.352984068739917, + "learning_rate": 2.6412302643172184e-06, + "loss": 0.3751, + "step": 7200 + }, + { + "epoch": 2.449980868160367, + "grad_norm": 5.990844269985125, + "learning_rate": 2.638522315452934e-06, + "loss": 0.384, + "step": 7205 + }, + { + "epoch": 2.4516814761277157, + "grad_norm": 4.424332453063507, + "learning_rate": 2.635814203553097e-06, + "loss": 0.3828, + "step": 7210 + }, + { + "epoch": 2.4533820840950638, + "grad_norm": 4.176743372483272, + "learning_rate": 2.6331059318050543e-06, + "loss": 0.3881, + "step": 7215 + }, + { + "epoch": 2.4550826920624123, + "grad_norm": 3.310052928713474, + "learning_rate": 2.6303975033963396e-06, + "loss": 0.3757, + "step": 7220 + }, + { + "epoch": 2.4567833000297608, + "grad_norm": 8.163648025882706, + "learning_rate": 2.627688921514672e-06, + "loss": 0.4053, + "step": 7225 + }, + { + "epoch": 2.458483907997109, + "grad_norm": 5.165457546817537, + "learning_rate": 2.6249801893479483e-06, + "loss": 0.3735, + "step": 7230 + }, + { + "epoch": 2.4601845159644573, + "grad_norm": 6.269256791836688, + "learning_rate": 2.622271310084246e-06, + "loss": 0.3573, + "step": 7235 + }, + { + "epoch": 2.461885123931806, + "grad_norm": 9.252976144429491, + "learning_rate": 2.619562286911814e-06, + "loss": 0.3759, + "step": 7240 + }, + { + "epoch": 2.463585731899154, + "grad_norm": 3.833607786003582, + "learning_rate": 2.6168531230190703e-06, + "loss": 0.3699, + "step": 7245 + }, + { + "epoch": 2.4652863398665024, + "grad_norm": 4.39713759616983, + "learning_rate": 2.6141438215945986e-06, + "loss": 0.3835, + "step": 7250 + }, + { + "epoch": 2.4669869478338504, + "grad_norm": 5.834908443228501, + "learning_rate": 2.6114343858271444e-06, + "loss": 0.3663, + "step": 7255 + }, + { + "epoch": 2.468687555801199, + "grad_norm": 3.411864572013808, + "learning_rate": 2.608724818905613e-06, + "loss": 0.3713, + "step": 7260 + }, + { + "epoch": 2.4703881637685474, + "grad_norm": 2.9180662262381007, + "learning_rate": 2.606015124019061e-06, + "loss": 0.3644, + "step": 7265 + }, + { + "epoch": 2.4720887717358955, + "grad_norm": 4.7665976923772435, + "learning_rate": 2.603305304356699e-06, + "loss": 0.3739, + "step": 7270 + }, + { + "epoch": 2.473789379703244, + "grad_norm": 4.417886027471119, + "learning_rate": 2.600595363107881e-06, + "loss": 0.4001, + "step": 7275 + }, + { + "epoch": 2.475489987670592, + "grad_norm": 3.611893878891544, + "learning_rate": 2.5978853034621068e-06, + "loss": 0.3836, + "step": 7280 + }, + { + "epoch": 2.4771905956379405, + "grad_norm": 7.797442710115348, + "learning_rate": 2.5951751286090147e-06, + "loss": 0.4007, + "step": 7285 + }, + { + "epoch": 2.478891203605289, + "grad_norm": 5.48946205897742, + "learning_rate": 2.5924648417383785e-06, + "loss": 0.3738, + "step": 7290 + }, + { + "epoch": 2.480591811572637, + "grad_norm": 8.268572046751316, + "learning_rate": 2.5897544460401035e-06, + "loss": 0.4021, + "step": 7295 + }, + { + "epoch": 2.4822924195399856, + "grad_norm": 3.1373030798338823, + "learning_rate": 2.5870439447042233e-06, + "loss": 0.3594, + "step": 7300 + }, + { + "epoch": 2.483993027507334, + "grad_norm": 6.809225971579622, + "learning_rate": 2.5843333409208965e-06, + "loss": 0.393, + "step": 7305 + }, + { + "epoch": 2.485693635474682, + "grad_norm": 3.408878613042235, + "learning_rate": 2.5816226378804016e-06, + "loss": 0.3796, + "step": 7310 + }, + { + "epoch": 2.4873942434420306, + "grad_norm": 3.403340689742314, + "learning_rate": 2.578911838773134e-06, + "loss": 0.3846, + "step": 7315 + }, + { + "epoch": 2.4890948514093787, + "grad_norm": 8.252308876674388, + "learning_rate": 2.5762009467896023e-06, + "loss": 0.3864, + "step": 7320 + }, + { + "epoch": 2.490795459376727, + "grad_norm": 8.842409358021643, + "learning_rate": 2.573489965120424e-06, + "loss": 0.3872, + "step": 7325 + }, + { + "epoch": 2.4924960673440757, + "grad_norm": 3.189248487541075, + "learning_rate": 2.570778896956322e-06, + "loss": 0.3687, + "step": 7330 + }, + { + "epoch": 2.4941966753114237, + "grad_norm": 4.395130713415741, + "learning_rate": 2.5680677454881233e-06, + "loss": 0.3742, + "step": 7335 + }, + { + "epoch": 2.4958972832787722, + "grad_norm": 6.024759124178671, + "learning_rate": 2.565356513906748e-06, + "loss": 0.3626, + "step": 7340 + }, + { + "epoch": 2.4975978912461203, + "grad_norm": 5.154437866323346, + "learning_rate": 2.5626452054032176e-06, + "loss": 0.3732, + "step": 7345 + }, + { + "epoch": 2.499298499213469, + "grad_norm": 5.064941238616173, + "learning_rate": 2.5599338231686377e-06, + "loss": 0.382, + "step": 7350 + }, + { + "epoch": 2.5009991071808173, + "grad_norm": 2.9330903967823887, + "learning_rate": 2.5572223703942035e-06, + "loss": 0.357, + "step": 7355 + }, + { + "epoch": 2.5026997151481654, + "grad_norm": 4.296783574336214, + "learning_rate": 2.554510850271193e-06, + "loss": 0.3939, + "step": 7360 + }, + { + "epoch": 2.504400323115514, + "grad_norm": 6.214571798228819, + "learning_rate": 2.5517992659909634e-06, + "loss": 0.382, + "step": 7365 + }, + { + "epoch": 2.5061009310828624, + "grad_norm": 3.970546982199594, + "learning_rate": 2.5490876207449475e-06, + "loss": 0.4019, + "step": 7370 + }, + { + "epoch": 2.5078015390502104, + "grad_norm": 39.025384810741045, + "learning_rate": 2.5463759177246495e-06, + "loss": 0.3722, + "step": 7375 + }, + { + "epoch": 2.509502147017559, + "grad_norm": 3.188083488241429, + "learning_rate": 2.5436641601216415e-06, + "loss": 0.3802, + "step": 7380 + }, + { + "epoch": 2.511202754984907, + "grad_norm": 3.14525105512372, + "learning_rate": 2.5409523511275606e-06, + "loss": 0.3773, + "step": 7385 + }, + { + "epoch": 2.5129033629522555, + "grad_norm": 3.4363663027066664, + "learning_rate": 2.5382404939341036e-06, + "loss": 0.386, + "step": 7390 + }, + { + "epoch": 2.5146039709196035, + "grad_norm": 3.2902728132282326, + "learning_rate": 2.5355285917330246e-06, + "loss": 0.3919, + "step": 7395 + }, + { + "epoch": 2.516304578886952, + "grad_norm": 3.3955336441336237, + "learning_rate": 2.53281664771613e-06, + "loss": 0.3862, + "step": 7400 + }, + { + "epoch": 2.5180051868543005, + "grad_norm": 30.81157843740032, + "learning_rate": 2.5301046650752763e-06, + "loss": 0.3748, + "step": 7405 + }, + { + "epoch": 2.5197057948216486, + "grad_norm": 6.625981841806447, + "learning_rate": 2.527392647002365e-06, + "loss": 0.3738, + "step": 7410 + }, + { + "epoch": 2.521406402788997, + "grad_norm": 3.0921998463324494, + "learning_rate": 2.5246805966893388e-06, + "loss": 0.3802, + "step": 7415 + }, + { + "epoch": 2.5231070107563456, + "grad_norm": 5.253948797961146, + "learning_rate": 2.5219685173281797e-06, + "loss": 0.3834, + "step": 7420 + }, + { + "epoch": 2.5248076187236936, + "grad_norm": 4.97384495475131, + "learning_rate": 2.5192564121109025e-06, + "loss": 0.3917, + "step": 7425 + }, + { + "epoch": 2.526508226691042, + "grad_norm": 3.525846323462171, + "learning_rate": 2.516544284229553e-06, + "loss": 0.4042, + "step": 7430 + }, + { + "epoch": 2.5282088346583906, + "grad_norm": 3.6262970555576683, + "learning_rate": 2.5138321368762036e-06, + "loss": 0.3846, + "step": 7435 + }, + { + "epoch": 2.5299094426257387, + "grad_norm": 7.761359044261236, + "learning_rate": 2.5111199732429497e-06, + "loss": 0.3536, + "step": 7440 + }, + { + "epoch": 2.531610050593087, + "grad_norm": 3.173733817605067, + "learning_rate": 2.5084077965219056e-06, + "loss": 0.3805, + "step": 7445 + }, + { + "epoch": 2.5333106585604352, + "grad_norm": 4.144913659058304, + "learning_rate": 2.5056956099052017e-06, + "loss": 0.3615, + "step": 7450 + }, + { + "epoch": 2.5350112665277837, + "grad_norm": 3.5670625717185755, + "learning_rate": 2.5029834165849787e-06, + "loss": 0.4021, + "step": 7455 + }, + { + "epoch": 2.536711874495132, + "grad_norm": 3.7333967813566105, + "learning_rate": 2.500271219753387e-06, + "loss": 0.3697, + "step": 7460 + }, + { + "epoch": 2.5384124824624803, + "grad_norm": 3.3655959893083276, + "learning_rate": 2.49755902260258e-06, + "loss": 0.4013, + "step": 7465 + }, + { + "epoch": 2.540113090429829, + "grad_norm": 3.864579886438611, + "learning_rate": 2.494846828324711e-06, + "loss": 0.3678, + "step": 7470 + }, + { + "epoch": 2.541813698397177, + "grad_norm": 3.570992827071064, + "learning_rate": 2.4921346401119317e-06, + "loss": 0.3591, + "step": 7475 + }, + { + "epoch": 2.5435143063645254, + "grad_norm": 5.462664500161082, + "learning_rate": 2.489422461156385e-06, + "loss": 0.3782, + "step": 7480 + }, + { + "epoch": 2.545214914331874, + "grad_norm": 3.398428072748691, + "learning_rate": 2.4867102946502034e-06, + "loss": 0.377, + "step": 7485 + }, + { + "epoch": 2.546915522299222, + "grad_norm": 3.4400743234994033, + "learning_rate": 2.4839981437855045e-06, + "loss": 0.3901, + "step": 7490 + }, + { + "epoch": 2.5486161302665704, + "grad_norm": 3.4595834831198524, + "learning_rate": 2.4812860117543883e-06, + "loss": 0.3861, + "step": 7495 + }, + { + "epoch": 2.5503167382339185, + "grad_norm": 5.267665892342849, + "learning_rate": 2.478573901748932e-06, + "loss": 0.4066, + "step": 7500 + }, + { + "epoch": 2.552017346201267, + "grad_norm": 5.184477130223167, + "learning_rate": 2.475861816961187e-06, + "loss": 0.4084, + "step": 7505 + }, + { + "epoch": 2.553717954168615, + "grad_norm": 2.513941647767793, + "learning_rate": 2.4731497605831747e-06, + "loss": 0.3718, + "step": 7510 + }, + { + "epoch": 2.5554185621359635, + "grad_norm": 3.5225126273980467, + "learning_rate": 2.470437735806884e-06, + "loss": 0.3827, + "step": 7515 + }, + { + "epoch": 2.557119170103312, + "grad_norm": 4.293542179809013, + "learning_rate": 2.4677257458242645e-06, + "loss": 0.3697, + "step": 7520 + }, + { + "epoch": 2.55881977807066, + "grad_norm": 5.69680532630941, + "learning_rate": 2.4650137938272285e-06, + "loss": 0.3802, + "step": 7525 + }, + { + "epoch": 2.5605203860380086, + "grad_norm": 2.967764381460697, + "learning_rate": 2.4623018830076405e-06, + "loss": 0.373, + "step": 7530 + }, + { + "epoch": 2.562220994005357, + "grad_norm": 7.904769990452013, + "learning_rate": 2.459590016557317e-06, + "loss": 0.347, + "step": 7535 + }, + { + "epoch": 2.563921601972705, + "grad_norm": 2.6712999240040443, + "learning_rate": 2.4568781976680233e-06, + "loss": 0.3489, + "step": 7540 + }, + { + "epoch": 2.5656222099400536, + "grad_norm": 4.817275571665213, + "learning_rate": 2.4541664295314677e-06, + "loss": 0.368, + "step": 7545 + }, + { + "epoch": 2.567322817907402, + "grad_norm": 2.9038353180708527, + "learning_rate": 2.4514547153392997e-06, + "loss": 0.3886, + "step": 7550 + }, + { + "epoch": 2.56902342587475, + "grad_norm": 3.755370708675757, + "learning_rate": 2.4487430582831047e-06, + "loss": 0.3554, + "step": 7555 + }, + { + "epoch": 2.5707240338420987, + "grad_norm": 4.313990405515008, + "learning_rate": 2.446031461554401e-06, + "loss": 0.3781, + "step": 7560 + }, + { + "epoch": 2.5724246418094467, + "grad_norm": 7.213752168555879, + "learning_rate": 2.4433199283446355e-06, + "loss": 0.3746, + "step": 7565 + }, + { + "epoch": 2.5741252497767952, + "grad_norm": 4.781137209485712, + "learning_rate": 2.4406084618451814e-06, + "loss": 0.3818, + "step": 7570 + }, + { + "epoch": 2.5758258577441433, + "grad_norm": 3.7032399148982402, + "learning_rate": 2.4378970652473326e-06, + "loss": 0.3767, + "step": 7575 + }, + { + "epoch": 2.577526465711492, + "grad_norm": 3.3414048650087937, + "learning_rate": 2.4351857417422997e-06, + "loss": 0.3805, + "step": 7580 + }, + { + "epoch": 2.5792270736788403, + "grad_norm": 3.0772607371490155, + "learning_rate": 2.43247449452121e-06, + "loss": 0.3903, + "step": 7585 + }, + { + "epoch": 2.5809276816461884, + "grad_norm": 3.571353844353706, + "learning_rate": 2.429763326775099e-06, + "loss": 0.3954, + "step": 7590 + }, + { + "epoch": 2.582628289613537, + "grad_norm": 4.5065287338944895, + "learning_rate": 2.4270522416949087e-06, + "loss": 0.3657, + "step": 7595 + }, + { + "epoch": 2.5843288975808854, + "grad_norm": 8.471981991390132, + "learning_rate": 2.4243412424714845e-06, + "loss": 0.4025, + "step": 7600 + }, + { + "epoch": 2.5860295055482334, + "grad_norm": 7.019370364912208, + "learning_rate": 2.42163033229557e-06, + "loss": 0.3711, + "step": 7605 + }, + { + "epoch": 2.587730113515582, + "grad_norm": 2.6919262297176045, + "learning_rate": 2.4189195143578055e-06, + "loss": 0.3839, + "step": 7610 + }, + { + "epoch": 2.5894307214829304, + "grad_norm": 4.008927772103962, + "learning_rate": 2.4162087918487207e-06, + "loss": 0.3821, + "step": 7615 + }, + { + "epoch": 2.5911313294502785, + "grad_norm": 5.902137336977824, + "learning_rate": 2.4134981679587342e-06, + "loss": 0.3883, + "step": 7620 + }, + { + "epoch": 2.592831937417627, + "grad_norm": 4.924077039933856, + "learning_rate": 2.4107876458781485e-06, + "loss": 0.3907, + "step": 7625 + }, + { + "epoch": 2.594532545384975, + "grad_norm": 2.8942783036038042, + "learning_rate": 2.4080772287971455e-06, + "loss": 0.3619, + "step": 7630 + }, + { + "epoch": 2.5962331533523235, + "grad_norm": 3.664197602241772, + "learning_rate": 2.405366919905785e-06, + "loss": 0.3676, + "step": 7635 + }, + { + "epoch": 2.5979337613196716, + "grad_norm": 3.3608785711897693, + "learning_rate": 2.4026567223939976e-06, + "loss": 0.395, + "step": 7640 + }, + { + "epoch": 2.59963436928702, + "grad_norm": 3.8505348636741195, + "learning_rate": 2.3999466394515846e-06, + "loss": 0.3761, + "step": 7645 + }, + { + "epoch": 2.6013349772543686, + "grad_norm": 5.337704928704311, + "learning_rate": 2.397236674268211e-06, + "loss": 0.3904, + "step": 7650 + }, + { + "epoch": 2.6030355852217166, + "grad_norm": 5.094246440451878, + "learning_rate": 2.3945268300334047e-06, + "loss": 0.3831, + "step": 7655 + }, + { + "epoch": 2.604736193189065, + "grad_norm": 26.06405211120412, + "learning_rate": 2.3918171099365493e-06, + "loss": 0.3683, + "step": 7660 + }, + { + "epoch": 2.6064368011564136, + "grad_norm": 4.381549185854137, + "learning_rate": 2.389107517166884e-06, + "loss": 0.3577, + "step": 7665 + }, + { + "epoch": 2.6081374091237617, + "grad_norm": 5.131982881725748, + "learning_rate": 2.386398054913497e-06, + "loss": 0.4, + "step": 7670 + }, + { + "epoch": 2.60983801709111, + "grad_norm": 3.852507064477302, + "learning_rate": 2.3836887263653246e-06, + "loss": 0.3688, + "step": 7675 + }, + { + "epoch": 2.6115386250584582, + "grad_norm": 5.5768234648563215, + "learning_rate": 2.380979534711143e-06, + "loss": 0.3816, + "step": 7680 + }, + { + "epoch": 2.6132392330258067, + "grad_norm": 6.229018627969279, + "learning_rate": 2.3782704831395694e-06, + "loss": 0.3757, + "step": 7685 + }, + { + "epoch": 2.614939840993155, + "grad_norm": 4.587671023782299, + "learning_rate": 2.3755615748390563e-06, + "loss": 0.3922, + "step": 7690 + }, + { + "epoch": 2.6166404489605033, + "grad_norm": 3.3821758897028174, + "learning_rate": 2.372852812997886e-06, + "loss": 0.3755, + "step": 7695 + }, + { + "epoch": 2.618341056927852, + "grad_norm": 5.019978799382523, + "learning_rate": 2.3701442008041682e-06, + "loss": 0.3646, + "step": 7700 + }, + { + "epoch": 2.6200416648952, + "grad_norm": 5.35780972776436, + "learning_rate": 2.3674357414458395e-06, + "loss": 0.3646, + "step": 7705 + }, + { + "epoch": 2.6217422728625484, + "grad_norm": 8.517200690934615, + "learning_rate": 2.364727438110654e-06, + "loss": 0.3682, + "step": 7710 + }, + { + "epoch": 2.623442880829897, + "grad_norm": 4.786010589800363, + "learning_rate": 2.3620192939861827e-06, + "loss": 0.3739, + "step": 7715 + }, + { + "epoch": 2.625143488797245, + "grad_norm": 6.383939384051332, + "learning_rate": 2.359311312259809e-06, + "loss": 0.3655, + "step": 7720 + }, + { + "epoch": 2.6268440967645934, + "grad_norm": 4.309724877465426, + "learning_rate": 2.356603496118726e-06, + "loss": 0.393, + "step": 7725 + }, + { + "epoch": 2.628544704731942, + "grad_norm": 5.480641163818891, + "learning_rate": 2.353895848749931e-06, + "loss": 0.3788, + "step": 7730 + }, + { + "epoch": 2.63024531269929, + "grad_norm": 4.328706468107978, + "learning_rate": 2.351188373340223e-06, + "loss": 0.377, + "step": 7735 + }, + { + "epoch": 2.6319459206666385, + "grad_norm": 7.62919261883074, + "learning_rate": 2.348481073076199e-06, + "loss": 0.4051, + "step": 7740 + }, + { + "epoch": 2.6336465286339865, + "grad_norm": 3.224620235847286, + "learning_rate": 2.345773951144249e-06, + "loss": 0.3943, + "step": 7745 + }, + { + "epoch": 2.635347136601335, + "grad_norm": 4.335437896054658, + "learning_rate": 2.343067010730554e-06, + "loss": 0.3917, + "step": 7750 + }, + { + "epoch": 2.637047744568683, + "grad_norm": 6.3436081714943935, + "learning_rate": 2.340360255021081e-06, + "loss": 0.3914, + "step": 7755 + }, + { + "epoch": 2.6387483525360316, + "grad_norm": 3.5420470161572357, + "learning_rate": 2.337653687201579e-06, + "loss": 0.4012, + "step": 7760 + }, + { + "epoch": 2.64044896050338, + "grad_norm": 5.199887613583664, + "learning_rate": 2.3349473104575775e-06, + "loss": 0.3701, + "step": 7765 + }, + { + "epoch": 2.642149568470728, + "grad_norm": 6.659111837995282, + "learning_rate": 2.3322411279743794e-06, + "loss": 0.3658, + "step": 7770 + }, + { + "epoch": 2.6438501764380766, + "grad_norm": 9.759159522946431, + "learning_rate": 2.32953514293706e-06, + "loss": 0.3669, + "step": 7775 + }, + { + "epoch": 2.645550784405425, + "grad_norm": 8.286491160452785, + "learning_rate": 2.3268293585304615e-06, + "loss": 0.3769, + "step": 7780 + }, + { + "epoch": 2.647251392372773, + "grad_norm": 2.889876656758775, + "learning_rate": 2.32412377793919e-06, + "loss": 0.4135, + "step": 7785 + }, + { + "epoch": 2.6489520003401217, + "grad_norm": 3.5824133480839353, + "learning_rate": 2.321418404347613e-06, + "loss": 0.394, + "step": 7790 + }, + { + "epoch": 2.65065260830747, + "grad_norm": 3.9983799200992594, + "learning_rate": 2.318713240939853e-06, + "loss": 0.3895, + "step": 7795 + }, + { + "epoch": 2.6523532162748182, + "grad_norm": 6.824853929282826, + "learning_rate": 2.316008290899785e-06, + "loss": 0.3843, + "step": 7800 + }, + { + "epoch": 2.6540538242421667, + "grad_norm": 3.310059658229797, + "learning_rate": 2.3133035574110338e-06, + "loss": 0.3845, + "step": 7805 + }, + { + "epoch": 2.655754432209515, + "grad_norm": 11.233557190194349, + "learning_rate": 2.310599043656969e-06, + "loss": 0.3888, + "step": 7810 + }, + { + "epoch": 2.6574550401768633, + "grad_norm": 6.894650044327275, + "learning_rate": 2.3078947528207012e-06, + "loss": 0.3815, + "step": 7815 + }, + { + "epoch": 2.6591556481442113, + "grad_norm": 4.2025389632182515, + "learning_rate": 2.3051906880850786e-06, + "loss": 0.3697, + "step": 7820 + }, + { + "epoch": 2.66085625611156, + "grad_norm": 5.769520512488445, + "learning_rate": 2.3024868526326846e-06, + "loss": 0.3762, + "step": 7825 + }, + { + "epoch": 2.6625568640789083, + "grad_norm": 4.6035878269965425, + "learning_rate": 2.299783249645832e-06, + "loss": 0.3663, + "step": 7830 + }, + { + "epoch": 2.6642574720462564, + "grad_norm": 3.1697397351640513, + "learning_rate": 2.297079882306558e-06, + "loss": 0.3667, + "step": 7835 + }, + { + "epoch": 2.665958080013605, + "grad_norm": 4.3110879400985604, + "learning_rate": 2.294376753796626e-06, + "loss": 0.3558, + "step": 7840 + }, + { + "epoch": 2.6676586879809534, + "grad_norm": 16.53847455813875, + "learning_rate": 2.2916738672975154e-06, + "loss": 0.3876, + "step": 7845 + }, + { + "epoch": 2.6693592959483015, + "grad_norm": 3.59045893067525, + "learning_rate": 2.2889712259904222e-06, + "loss": 0.3901, + "step": 7850 + }, + { + "epoch": 2.67105990391565, + "grad_norm": 4.1466790838667755, + "learning_rate": 2.286268833056254e-06, + "loss": 0.3908, + "step": 7855 + }, + { + "epoch": 2.672760511882998, + "grad_norm": 4.030465815283501, + "learning_rate": 2.283566691675625e-06, + "loss": 0.3778, + "step": 7860 + }, + { + "epoch": 2.6744611198503465, + "grad_norm": 4.993116520883105, + "learning_rate": 2.2808648050288535e-06, + "loss": 0.3882, + "step": 7865 + }, + { + "epoch": 2.6761617278176946, + "grad_norm": 3.8372994898270005, + "learning_rate": 2.2781631762959596e-06, + "loss": 0.3817, + "step": 7870 + }, + { + "epoch": 2.677862335785043, + "grad_norm": 3.519788857821788, + "learning_rate": 2.2754618086566572e-06, + "loss": 0.4041, + "step": 7875 + }, + { + "epoch": 2.6795629437523916, + "grad_norm": 14.842927848150968, + "learning_rate": 2.272760705290356e-06, + "loss": 0.3651, + "step": 7880 + }, + { + "epoch": 2.6812635517197396, + "grad_norm": 7.4190183694324965, + "learning_rate": 2.270059869376151e-06, + "loss": 0.3959, + "step": 7885 + }, + { + "epoch": 2.682964159687088, + "grad_norm": 2.8103680082154043, + "learning_rate": 2.267359304092826e-06, + "loss": 0.3607, + "step": 7890 + }, + { + "epoch": 2.6846647676544366, + "grad_norm": 7.937493753557344, + "learning_rate": 2.264659012618845e-06, + "loss": 0.385, + "step": 7895 + }, + { + "epoch": 2.6863653756217847, + "grad_norm": 6.71945655343373, + "learning_rate": 2.2619589981323483e-06, + "loss": 0.392, + "step": 7900 + }, + { + "epoch": 2.688065983589133, + "grad_norm": 6.118543164068601, + "learning_rate": 2.259259263811151e-06, + "loss": 0.3672, + "step": 7905 + }, + { + "epoch": 2.6897665915564817, + "grad_norm": 6.386964430087839, + "learning_rate": 2.2565598128327406e-06, + "loss": 0.4052, + "step": 7910 + }, + { + "epoch": 2.6914671995238297, + "grad_norm": 4.389010997821173, + "learning_rate": 2.2538606483742676e-06, + "loss": 0.4002, + "step": 7915 + }, + { + "epoch": 2.6931678074911782, + "grad_norm": 6.583744813712634, + "learning_rate": 2.2511617736125474e-06, + "loss": 0.4012, + "step": 7920 + }, + { + "epoch": 2.6948684154585263, + "grad_norm": 4.355606914465958, + "learning_rate": 2.2484631917240545e-06, + "loss": 0.3847, + "step": 7925 + }, + { + "epoch": 2.696569023425875, + "grad_norm": 5.6245867527012035, + "learning_rate": 2.245764905884918e-06, + "loss": 0.3712, + "step": 7930 + }, + { + "epoch": 2.698269631393223, + "grad_norm": 6.816235104622127, + "learning_rate": 2.2430669192709185e-06, + "loss": 0.3903, + "step": 7935 + }, + { + "epoch": 2.6999702393605713, + "grad_norm": 3.7447476154787367, + "learning_rate": 2.240369235057485e-06, + "loss": 0.3696, + "step": 7940 + }, + { + "epoch": 2.70167084732792, + "grad_norm": 3.505473103172748, + "learning_rate": 2.2376718564196893e-06, + "loss": 0.3748, + "step": 7945 + }, + { + "epoch": 2.703371455295268, + "grad_norm": 8.182338734493161, + "learning_rate": 2.2349747865322463e-06, + "loss": 0.3671, + "step": 7950 + }, + { + "epoch": 2.7050720632626164, + "grad_norm": 3.3738020778194224, + "learning_rate": 2.232278028569504e-06, + "loss": 0.3735, + "step": 7955 + }, + { + "epoch": 2.706772671229965, + "grad_norm": 6.372010091411438, + "learning_rate": 2.229581585705447e-06, + "loss": 0.3805, + "step": 7960 + }, + { + "epoch": 2.708473279197313, + "grad_norm": 3.721118322118046, + "learning_rate": 2.2268854611136853e-06, + "loss": 0.3487, + "step": 7965 + }, + { + "epoch": 2.7101738871646615, + "grad_norm": 4.129352252876067, + "learning_rate": 2.2241896579674563e-06, + "loss": 0.3678, + "step": 7970 + }, + { + "epoch": 2.71187449513201, + "grad_norm": 6.703226073088949, + "learning_rate": 2.22149417943962e-06, + "loss": 0.3751, + "step": 7975 + }, + { + "epoch": 2.713575103099358, + "grad_norm": 7.087381797195783, + "learning_rate": 2.2187990287026525e-06, + "loss": 0.3662, + "step": 7980 + }, + { + "epoch": 2.7152757110667065, + "grad_norm": 7.831322740182201, + "learning_rate": 2.2161042089286444e-06, + "loss": 0.4128, + "step": 7985 + }, + { + "epoch": 2.7169763190340546, + "grad_norm": 23.023277527211253, + "learning_rate": 2.2134097232892974e-06, + "loss": 0.373, + "step": 7990 + }, + { + "epoch": 2.718676927001403, + "grad_norm": 3.767570360293103, + "learning_rate": 2.21071557495592e-06, + "loss": 0.3876, + "step": 7995 + }, + { + "epoch": 2.720377534968751, + "grad_norm": 3.786013061698436, + "learning_rate": 2.208021767099423e-06, + "loss": 0.3597, + "step": 8000 + }, + { + "epoch": 2.7220781429360996, + "grad_norm": 4.453520170115316, + "learning_rate": 2.2053283028903174e-06, + "loss": 0.3893, + "step": 8005 + }, + { + "epoch": 2.723778750903448, + "grad_norm": 3.5345683175464413, + "learning_rate": 2.2026351854987084e-06, + "loss": 0.3609, + "step": 8010 + }, + { + "epoch": 2.725479358870796, + "grad_norm": 2.971594563933989, + "learning_rate": 2.1999424180942945e-06, + "loss": 0.3667, + "step": 8015 + }, + { + "epoch": 2.7271799668381447, + "grad_norm": 11.62878333088526, + "learning_rate": 2.1972500038463614e-06, + "loss": 0.372, + "step": 8020 + }, + { + "epoch": 2.728880574805493, + "grad_norm": 6.510889807932155, + "learning_rate": 2.1945579459237787e-06, + "loss": 0.3886, + "step": 8025 + }, + { + "epoch": 2.7305811827728412, + "grad_norm": 32.701936557309374, + "learning_rate": 2.1918662474949974e-06, + "loss": 0.3719, + "step": 8030 + }, + { + "epoch": 2.7322817907401897, + "grad_norm": 4.0058386725211, + "learning_rate": 2.1891749117280463e-06, + "loss": 0.3832, + "step": 8035 + }, + { + "epoch": 2.733982398707538, + "grad_norm": 3.019026214432123, + "learning_rate": 2.186483941790526e-06, + "loss": 0.389, + "step": 8040 + }, + { + "epoch": 2.7356830066748863, + "grad_norm": 3.770997398095723, + "learning_rate": 2.183793340849606e-06, + "loss": 0.3508, + "step": 8045 + }, + { + "epoch": 2.7373836146422343, + "grad_norm": 4.222862912262694, + "learning_rate": 2.181103112072023e-06, + "loss": 0.3507, + "step": 8050 + }, + { + "epoch": 2.739084222609583, + "grad_norm": 3.910203241840868, + "learning_rate": 2.1784132586240746e-06, + "loss": 0.3844, + "step": 8055 + }, + { + "epoch": 2.7407848305769313, + "grad_norm": 21.535114374675835, + "learning_rate": 2.1757237836716173e-06, + "loss": 0.3707, + "step": 8060 + }, + { + "epoch": 2.7424854385442794, + "grad_norm": 4.1731150684600555, + "learning_rate": 2.1730346903800625e-06, + "loss": 0.3872, + "step": 8065 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 82.33717574896272, + "learning_rate": 2.170345981914371e-06, + "loss": 0.3993, + "step": 8070 + }, + { + "epoch": 2.7458866544789764, + "grad_norm": 6.524400159721665, + "learning_rate": 2.167657661439051e-06, + "loss": 0.3801, + "step": 8075 + }, + { + "epoch": 2.7475872624463245, + "grad_norm": 4.9763136904935115, + "learning_rate": 2.1649697321181555e-06, + "loss": 0.3828, + "step": 8080 + }, + { + "epoch": 2.749287870413673, + "grad_norm": 5.610444848264231, + "learning_rate": 2.1622821971152762e-06, + "loss": 0.3753, + "step": 8085 + }, + { + "epoch": 2.7509884783810215, + "grad_norm": 3.6915191716116125, + "learning_rate": 2.1595950595935393e-06, + "loss": 0.3803, + "step": 8090 + }, + { + "epoch": 2.7526890863483695, + "grad_norm": 6.5278947678428025, + "learning_rate": 2.1569083227156064e-06, + "loss": 0.3637, + "step": 8095 + }, + { + "epoch": 2.754389694315718, + "grad_norm": 5.5775103216434205, + "learning_rate": 2.1542219896436647e-06, + "loss": 0.3746, + "step": 8100 + }, + { + "epoch": 2.756090302283066, + "grad_norm": 4.85406609326352, + "learning_rate": 2.151536063539427e-06, + "loss": 0.3846, + "step": 8105 + }, + { + "epoch": 2.7577909102504146, + "grad_norm": 8.979352424237328, + "learning_rate": 2.148850547564128e-06, + "loss": 0.3774, + "step": 8110 + }, + { + "epoch": 2.7594915182177626, + "grad_norm": 4.647022479580419, + "learning_rate": 2.146165444878518e-06, + "loss": 0.3612, + "step": 8115 + }, + { + "epoch": 2.761192126185111, + "grad_norm": 4.257524473576935, + "learning_rate": 2.143480758642862e-06, + "loss": 0.3877, + "step": 8120 + }, + { + "epoch": 2.7628927341524596, + "grad_norm": 5.083975554617405, + "learning_rate": 2.140796492016935e-06, + "loss": 0.3972, + "step": 8125 + }, + { + "epoch": 2.7645933421198077, + "grad_norm": 3.7558709013901317, + "learning_rate": 2.1381126481600177e-06, + "loss": 0.3704, + "step": 8130 + }, + { + "epoch": 2.766293950087156, + "grad_norm": 6.936058030138578, + "learning_rate": 2.1354292302308934e-06, + "loss": 0.3993, + "step": 8135 + }, + { + "epoch": 2.7679945580545047, + "grad_norm": 3.8309644397369635, + "learning_rate": 2.1327462413878435e-06, + "loss": 0.3586, + "step": 8140 + }, + { + "epoch": 2.7696951660218527, + "grad_norm": 6.63698719991675, + "learning_rate": 2.1300636847886454e-06, + "loss": 0.3789, + "step": 8145 + }, + { + "epoch": 2.7713957739892012, + "grad_norm": 3.6810387030854956, + "learning_rate": 2.1273815635905665e-06, + "loss": 0.3583, + "step": 8150 + }, + { + "epoch": 2.7730963819565497, + "grad_norm": 5.340160023912234, + "learning_rate": 2.124699880950364e-06, + "loss": 0.3606, + "step": 8155 + }, + { + "epoch": 2.774796989923898, + "grad_norm": 3.3374867768954144, + "learning_rate": 2.122018640024276e-06, + "loss": 0.3534, + "step": 8160 + }, + { + "epoch": 2.7764975978912463, + "grad_norm": 23.05919247327434, + "learning_rate": 2.119337843968023e-06, + "loss": 0.366, + "step": 8165 + }, + { + "epoch": 2.7781982058585943, + "grad_norm": 10.548598893835438, + "learning_rate": 2.1166574959368007e-06, + "loss": 0.3637, + "step": 8170 + }, + { + "epoch": 2.779898813825943, + "grad_norm": 3.719602530650269, + "learning_rate": 2.1139775990852777e-06, + "loss": 0.376, + "step": 8175 + }, + { + "epoch": 2.781599421793291, + "grad_norm": 3.9314066322384407, + "learning_rate": 2.111298156567592e-06, + "loss": 0.3735, + "step": 8180 + }, + { + "epoch": 2.7833000297606394, + "grad_norm": 3.8470975640453413, + "learning_rate": 2.1086191715373465e-06, + "loss": 0.3667, + "step": 8185 + }, + { + "epoch": 2.785000637727988, + "grad_norm": 13.062597481548549, + "learning_rate": 2.105940647147606e-06, + "loss": 0.3639, + "step": 8190 + }, + { + "epoch": 2.786701245695336, + "grad_norm": 4.425463584564762, + "learning_rate": 2.1032625865508927e-06, + "loss": 0.3811, + "step": 8195 + }, + { + "epoch": 2.7884018536626844, + "grad_norm": 8.416501818865054, + "learning_rate": 2.1005849928991827e-06, + "loss": 0.3811, + "step": 8200 + }, + { + "epoch": 2.790102461630033, + "grad_norm": 8.558767887079613, + "learning_rate": 2.0979078693439038e-06, + "loss": 0.3711, + "step": 8205 + }, + { + "epoch": 2.791803069597381, + "grad_norm": 3.4236488145310986, + "learning_rate": 2.0952312190359287e-06, + "loss": 0.3891, + "step": 8210 + }, + { + "epoch": 2.7935036775647295, + "grad_norm": 5.493554420139218, + "learning_rate": 2.0925550451255747e-06, + "loss": 0.376, + "step": 8215 + }, + { + "epoch": 2.7952042855320776, + "grad_norm": 3.313325831993448, + "learning_rate": 2.089879350762598e-06, + "loss": 0.3685, + "step": 8220 + }, + { + "epoch": 2.796904893499426, + "grad_norm": 22.4153088648218, + "learning_rate": 2.08720413909619e-06, + "loss": 0.3602, + "step": 8225 + }, + { + "epoch": 2.798605501466774, + "grad_norm": 8.98408239788447, + "learning_rate": 2.0845294132749736e-06, + "loss": 0.3681, + "step": 8230 + }, + { + "epoch": 2.8003061094341226, + "grad_norm": 3.4347162415729353, + "learning_rate": 2.081855176447001e-06, + "loss": 0.3784, + "step": 8235 + }, + { + "epoch": 2.802006717401471, + "grad_norm": 4.215328915462785, + "learning_rate": 2.079181431759748e-06, + "loss": 0.3803, + "step": 8240 + }, + { + "epoch": 2.803707325368819, + "grad_norm": 119.95437737730414, + "learning_rate": 2.076508182360111e-06, + "loss": 0.3872, + "step": 8245 + }, + { + "epoch": 2.8054079333361677, + "grad_norm": 3.9604876165994285, + "learning_rate": 2.0738354313944055e-06, + "loss": 0.3849, + "step": 8250 + }, + { + "epoch": 2.807108541303516, + "grad_norm": 4.523244770758718, + "learning_rate": 2.0711631820083575e-06, + "loss": 0.3715, + "step": 8255 + }, + { + "epoch": 2.8088091492708642, + "grad_norm": 3.6297302660625874, + "learning_rate": 2.068491437347104e-06, + "loss": 0.3842, + "step": 8260 + }, + { + "epoch": 2.8105097572382127, + "grad_norm": 5.359805674286737, + "learning_rate": 2.065820200555188e-06, + "loss": 0.3387, + "step": 8265 + }, + { + "epoch": 2.8122103652055612, + "grad_norm": 4.557516040202472, + "learning_rate": 2.0631494747765546e-06, + "loss": 0.3659, + "step": 8270 + }, + { + "epoch": 2.8139109731729093, + "grad_norm": 6.7908173456117895, + "learning_rate": 2.0604792631545482e-06, + "loss": 0.3928, + "step": 8275 + }, + { + "epoch": 2.815611581140258, + "grad_norm": 11.121940372885511, + "learning_rate": 2.057809568831907e-06, + "loss": 0.3543, + "step": 8280 + }, + { + "epoch": 2.817312189107606, + "grad_norm": 30.478693939131286, + "learning_rate": 2.0551403949507604e-06, + "loss": 0.3676, + "step": 8285 + }, + { + "epoch": 2.8190127970749543, + "grad_norm": 10.862270715950094, + "learning_rate": 2.0524717446526264e-06, + "loss": 0.3529, + "step": 8290 + }, + { + "epoch": 2.8207134050423024, + "grad_norm": 9.092002991175214, + "learning_rate": 2.049803621078405e-06, + "loss": 0.3663, + "step": 8295 + }, + { + "epoch": 2.822414013009651, + "grad_norm": 3.4487696947423574, + "learning_rate": 2.047136027368378e-06, + "loss": 0.3568, + "step": 8300 + }, + { + "epoch": 2.8241146209769994, + "grad_norm": 3.5544329404182404, + "learning_rate": 2.044468966662202e-06, + "loss": 0.3785, + "step": 8305 + }, + { + "epoch": 2.8258152289443474, + "grad_norm": 14.164049469477636, + "learning_rate": 2.0418024420989075e-06, + "loss": 0.3825, + "step": 8310 + }, + { + "epoch": 2.827515836911696, + "grad_norm": 4.361080170557115, + "learning_rate": 2.0391364568168936e-06, + "loss": 0.3776, + "step": 8315 + }, + { + "epoch": 2.8292164448790444, + "grad_norm": 7.4345207036596115, + "learning_rate": 2.036471013953925e-06, + "loss": 0.3928, + "step": 8320 + }, + { + "epoch": 2.8309170528463925, + "grad_norm": 3.1225404452489536, + "learning_rate": 2.033806116647127e-06, + "loss": 0.3881, + "step": 8325 + }, + { + "epoch": 2.832617660813741, + "grad_norm": 7.796796229295892, + "learning_rate": 2.031141768032983e-06, + "loss": 0.3757, + "step": 8330 + }, + { + "epoch": 2.8343182687810895, + "grad_norm": 3.299102522694853, + "learning_rate": 2.028477971247332e-06, + "loss": 0.3402, + "step": 8335 + }, + { + "epoch": 2.8360188767484376, + "grad_norm": 4.067819755694867, + "learning_rate": 2.0258147294253627e-06, + "loss": 0.3761, + "step": 8340 + }, + { + "epoch": 2.837719484715786, + "grad_norm": 8.741701238488007, + "learning_rate": 2.02315204570161e-06, + "loss": 0.346, + "step": 8345 + }, + { + "epoch": 2.839420092683134, + "grad_norm": 4.864531791571482, + "learning_rate": 2.0204899232099527e-06, + "loss": 0.3809, + "step": 8350 + }, + { + "epoch": 2.8411207006504826, + "grad_norm": 4.759769443217969, + "learning_rate": 2.017828365083608e-06, + "loss": 0.3468, + "step": 8355 + }, + { + "epoch": 2.8428213086178307, + "grad_norm": 5.282635242019972, + "learning_rate": 2.0151673744551305e-06, + "loss": 0.3933, + "step": 8360 + }, + { + "epoch": 2.844521916585179, + "grad_norm": 6.999001062086554, + "learning_rate": 2.0125069544564057e-06, + "loss": 0.382, + "step": 8365 + }, + { + "epoch": 2.8462225245525277, + "grad_norm": 4.122810194501148, + "learning_rate": 2.009847108218648e-06, + "loss": 0.3712, + "step": 8370 + }, + { + "epoch": 2.8479231325198757, + "grad_norm": 3.8107728928149824, + "learning_rate": 2.007187838872396e-06, + "loss": 0.3765, + "step": 8375 + }, + { + "epoch": 2.849623740487224, + "grad_norm": 10.353358184774276, + "learning_rate": 2.00452914954751e-06, + "loss": 0.3617, + "step": 8380 + }, + { + "epoch": 2.8513243484545727, + "grad_norm": 7.173375709728723, + "learning_rate": 2.0018710433731667e-06, + "loss": 0.3511, + "step": 8385 + }, + { + "epoch": 2.8530249564219208, + "grad_norm": 4.87187528806106, + "learning_rate": 1.999213523477857e-06, + "loss": 0.3847, + "step": 8390 + }, + { + "epoch": 2.8547255643892693, + "grad_norm": 4.975178172526431, + "learning_rate": 1.9965565929893825e-06, + "loss": 0.3707, + "step": 8395 + }, + { + "epoch": 2.8564261723566173, + "grad_norm": 10.3854682049754, + "learning_rate": 1.9939002550348506e-06, + "loss": 0.3702, + "step": 8400 + }, + { + "epoch": 2.858126780323966, + "grad_norm": 20.460389393682824, + "learning_rate": 1.99124451274067e-06, + "loss": 0.3771, + "step": 8405 + }, + { + "epoch": 2.859827388291314, + "grad_norm": 5.005293028199917, + "learning_rate": 1.98858936923255e-06, + "loss": 0.3575, + "step": 8410 + }, + { + "epoch": 2.8615279962586624, + "grad_norm": 2.8275024075063144, + "learning_rate": 1.985934827635495e-06, + "loss": 0.3681, + "step": 8415 + }, + { + "epoch": 2.863228604226011, + "grad_norm": 4.977576052712527, + "learning_rate": 1.9832808910738e-06, + "loss": 0.3991, + "step": 8420 + }, + { + "epoch": 2.864929212193359, + "grad_norm": 3.3366440868337612, + "learning_rate": 1.9806275626710483e-06, + "loss": 0.3887, + "step": 8425 + }, + { + "epoch": 2.8666298201607074, + "grad_norm": 31.395805092917065, + "learning_rate": 1.977974845550108e-06, + "loss": 0.395, + "step": 8430 + }, + { + "epoch": 2.868330428128056, + "grad_norm": 2.9171081488360238, + "learning_rate": 1.975322742833127e-06, + "loss": 0.3638, + "step": 8435 + }, + { + "epoch": 2.870031036095404, + "grad_norm": 7.071623054198009, + "learning_rate": 1.972671257641531e-06, + "loss": 0.3572, + "step": 8440 + }, + { + "epoch": 2.8717316440627525, + "grad_norm": 4.116140867083445, + "learning_rate": 1.970020393096017e-06, + "loss": 0.3551, + "step": 8445 + }, + { + "epoch": 2.873432252030101, + "grad_norm": 4.582359505283773, + "learning_rate": 1.9673701523165537e-06, + "loss": 0.3589, + "step": 8450 + }, + { + "epoch": 2.875132859997449, + "grad_norm": 4.402633709070161, + "learning_rate": 1.964720538422375e-06, + "loss": 0.3824, + "step": 8455 + }, + { + "epoch": 2.8768334679647976, + "grad_norm": 3.686748929733052, + "learning_rate": 1.9620715545319763e-06, + "loss": 0.386, + "step": 8460 + }, + { + "epoch": 2.8785340759321456, + "grad_norm": 3.501304828830748, + "learning_rate": 1.959423203763112e-06, + "loss": 0.3814, + "step": 8465 + }, + { + "epoch": 2.880234683899494, + "grad_norm": 4.273306250942062, + "learning_rate": 1.9567754892327913e-06, + "loss": 0.3611, + "step": 8470 + }, + { + "epoch": 2.881935291866842, + "grad_norm": 4.377492367442627, + "learning_rate": 1.9541284140572747e-06, + "loss": 0.373, + "step": 8475 + }, + { + "epoch": 2.8836358998341907, + "grad_norm": 3.4896546835096607, + "learning_rate": 1.9514819813520697e-06, + "loss": 0.3733, + "step": 8480 + }, + { + "epoch": 2.885336507801539, + "grad_norm": 6.416417362163938, + "learning_rate": 1.9488361942319283e-06, + "loss": 0.3781, + "step": 8485 + }, + { + "epoch": 2.887037115768887, + "grad_norm": 4.073770948021414, + "learning_rate": 1.946191055810842e-06, + "loss": 0.3742, + "step": 8490 + }, + { + "epoch": 2.8887377237362357, + "grad_norm": 3.2692801404584086, + "learning_rate": 1.94354656920204e-06, + "loss": 0.38, + "step": 8495 + }, + { + "epoch": 2.890438331703584, + "grad_norm": 3.868075079047741, + "learning_rate": 1.9409027375179827e-06, + "loss": 0.3727, + "step": 8500 + }, + { + "epoch": 2.8921389396709323, + "grad_norm": 3.498489470199948, + "learning_rate": 1.9382595638703603e-06, + "loss": 0.3687, + "step": 8505 + }, + { + "epoch": 2.8938395476382808, + "grad_norm": 4.898602514999291, + "learning_rate": 1.935617051370089e-06, + "loss": 0.3813, + "step": 8510 + }, + { + "epoch": 2.8955401556056293, + "grad_norm": 3.7056569273722206, + "learning_rate": 1.9329752031273073e-06, + "loss": 0.3644, + "step": 8515 + }, + { + "epoch": 2.8972407635729773, + "grad_norm": 3.270896760191762, + "learning_rate": 1.93033402225137e-06, + "loss": 0.3922, + "step": 8520 + }, + { + "epoch": 2.898941371540326, + "grad_norm": 11.671782966595131, + "learning_rate": 1.927693511850849e-06, + "loss": 0.3801, + "step": 8525 + }, + { + "epoch": 2.900641979507674, + "grad_norm": 7.352868576954751, + "learning_rate": 1.925053675033524e-06, + "loss": 0.3963, + "step": 8530 + }, + { + "epoch": 2.9023425874750224, + "grad_norm": 14.006913439271788, + "learning_rate": 1.9224145149063845e-06, + "loss": 0.3601, + "step": 8535 + }, + { + "epoch": 2.9040431954423704, + "grad_norm": 3.647756897066697, + "learning_rate": 1.9197760345756227e-06, + "loss": 0.3596, + "step": 8540 + }, + { + "epoch": 2.905743803409719, + "grad_norm": 4.815506591854202, + "learning_rate": 1.9171382371466302e-06, + "loss": 0.3814, + "step": 8545 + }, + { + "epoch": 2.9074444113770674, + "grad_norm": 6.5540382195623375, + "learning_rate": 1.9145011257239957e-06, + "loss": 0.3995, + "step": 8550 + }, + { + "epoch": 2.9091450193444155, + "grad_norm": 4.2081097534769745, + "learning_rate": 1.9118647034115e-06, + "loss": 0.3768, + "step": 8555 + }, + { + "epoch": 2.910845627311764, + "grad_norm": 3.1412509842846186, + "learning_rate": 1.909228973312113e-06, + "loss": 0.3569, + "step": 8560 + }, + { + "epoch": 2.9125462352791125, + "grad_norm": 5.596007643345584, + "learning_rate": 1.9065939385279892e-06, + "loss": 0.371, + "step": 8565 + }, + { + "epoch": 2.9142468432464605, + "grad_norm": 5.501455976838975, + "learning_rate": 1.9039596021604654e-06, + "loss": 0.3831, + "step": 8570 + }, + { + "epoch": 2.915947451213809, + "grad_norm": 2.852898924760873, + "learning_rate": 1.9013259673100577e-06, + "loss": 0.3613, + "step": 8575 + }, + { + "epoch": 2.917648059181157, + "grad_norm": 5.966472806636236, + "learning_rate": 1.898693037076454e-06, + "loss": 0.38, + "step": 8580 + }, + { + "epoch": 2.9193486671485056, + "grad_norm": 5.9729388381891155, + "learning_rate": 1.8960608145585143e-06, + "loss": 0.3894, + "step": 8585 + }, + { + "epoch": 2.9210492751158537, + "grad_norm": 6.982534331399855, + "learning_rate": 1.8934293028542657e-06, + "loss": 0.353, + "step": 8590 + }, + { + "epoch": 2.922749883083202, + "grad_norm": 23.139564328892174, + "learning_rate": 1.8907985050608984e-06, + "loss": 0.3714, + "step": 8595 + }, + { + "epoch": 2.9244504910505507, + "grad_norm": 5.93645905058757, + "learning_rate": 1.8881684242747622e-06, + "loss": 0.3774, + "step": 8600 + }, + { + "epoch": 2.9261510990178987, + "grad_norm": 8.136608402056961, + "learning_rate": 1.8855390635913634e-06, + "loss": 0.3897, + "step": 8605 + }, + { + "epoch": 2.927851706985247, + "grad_norm": 3.021573214294128, + "learning_rate": 1.8829104261053602e-06, + "loss": 0.3736, + "step": 8610 + }, + { + "epoch": 2.9295523149525957, + "grad_norm": 4.933923209392697, + "learning_rate": 1.8802825149105603e-06, + "loss": 0.3653, + "step": 8615 + }, + { + "epoch": 2.9312529229199438, + "grad_norm": 4.706545349659176, + "learning_rate": 1.877655333099916e-06, + "loss": 0.3516, + "step": 8620 + }, + { + "epoch": 2.9329535308872923, + "grad_norm": 6.6339352896567165, + "learning_rate": 1.8750288837655218e-06, + "loss": 0.3747, + "step": 8625 + }, + { + "epoch": 2.9346541388546408, + "grad_norm": 6.618639590750141, + "learning_rate": 1.8724031699986089e-06, + "loss": 0.3814, + "step": 8630 + }, + { + "epoch": 2.936354746821989, + "grad_norm": 7.978336468342465, + "learning_rate": 1.8697781948895446e-06, + "loss": 0.3676, + "step": 8635 + }, + { + "epoch": 2.9380553547893373, + "grad_norm": 4.414106043923808, + "learning_rate": 1.8671539615278257e-06, + "loss": 0.389, + "step": 8640 + }, + { + "epoch": 2.9397559627566854, + "grad_norm": 3.3613193471561646, + "learning_rate": 1.8645304730020752e-06, + "loss": 0.3667, + "step": 8645 + }, + { + "epoch": 2.941456570724034, + "grad_norm": 4.364863672568588, + "learning_rate": 1.8619077324000414e-06, + "loss": 0.3637, + "step": 8650 + }, + { + "epoch": 2.943157178691382, + "grad_norm": 5.026429905846794, + "learning_rate": 1.8592857428085909e-06, + "loss": 0.3717, + "step": 8655 + }, + { + "epoch": 2.9448577866587304, + "grad_norm": 8.479583243415247, + "learning_rate": 1.8566645073137065e-06, + "loss": 0.379, + "step": 8660 + }, + { + "epoch": 2.946558394626079, + "grad_norm": 6.665050142493819, + "learning_rate": 1.854044029000484e-06, + "loss": 0.3963, + "step": 8665 + }, + { + "epoch": 2.948259002593427, + "grad_norm": 3.48398056319533, + "learning_rate": 1.8514243109531277e-06, + "loss": 0.3789, + "step": 8670 + }, + { + "epoch": 2.9499596105607755, + "grad_norm": 4.310777202329698, + "learning_rate": 1.848805356254947e-06, + "loss": 0.3705, + "step": 8675 + }, + { + "epoch": 2.951660218528124, + "grad_norm": 4.9980406559805415, + "learning_rate": 1.8461871679883531e-06, + "loss": 0.3973, + "step": 8680 + }, + { + "epoch": 2.953360826495472, + "grad_norm": 3.63822826338399, + "learning_rate": 1.843569749234855e-06, + "loss": 0.3765, + "step": 8685 + }, + { + "epoch": 2.9550614344628205, + "grad_norm": 4.862423352223616, + "learning_rate": 1.8409531030750563e-06, + "loss": 0.3962, + "step": 8690 + }, + { + "epoch": 2.956762042430169, + "grad_norm": 4.381906282945819, + "learning_rate": 1.83833723258865e-06, + "loss": 0.3688, + "step": 8695 + }, + { + "epoch": 2.958462650397517, + "grad_norm": 4.069621307312408, + "learning_rate": 1.835722140854419e-06, + "loss": 0.3868, + "step": 8700 + }, + { + "epoch": 2.9601632583648656, + "grad_norm": 4.377704750031197, + "learning_rate": 1.833107830950227e-06, + "loss": 0.3683, + "step": 8705 + }, + { + "epoch": 2.9618638663322137, + "grad_norm": 4.293619625964835, + "learning_rate": 1.8304943059530178e-06, + "loss": 0.3584, + "step": 8710 + }, + { + "epoch": 2.963564474299562, + "grad_norm": 6.002080159765941, + "learning_rate": 1.827881568938813e-06, + "loss": 0.3672, + "step": 8715 + }, + { + "epoch": 2.96526508226691, + "grad_norm": 9.916525104628843, + "learning_rate": 1.825269622982705e-06, + "loss": 0.3741, + "step": 8720 + }, + { + "epoch": 2.9669656902342587, + "grad_norm": 5.040093533159837, + "learning_rate": 1.8226584711588557e-06, + "loss": 0.3687, + "step": 8725 + }, + { + "epoch": 2.968666298201607, + "grad_norm": 7.20763291595424, + "learning_rate": 1.8200481165404932e-06, + "loss": 0.3708, + "step": 8730 + }, + { + "epoch": 2.9703669061689553, + "grad_norm": 9.340020113857413, + "learning_rate": 1.8174385621999064e-06, + "loss": 0.3835, + "step": 8735 + }, + { + "epoch": 2.9720675141363038, + "grad_norm": 8.400152105913959, + "learning_rate": 1.8148298112084425e-06, + "loss": 0.3856, + "step": 8740 + }, + { + "epoch": 2.9737681221036523, + "grad_norm": 3.43374814609035, + "learning_rate": 1.8122218666365032e-06, + "loss": 0.3865, + "step": 8745 + }, + { + "epoch": 2.9754687300710003, + "grad_norm": 10.47961502658653, + "learning_rate": 1.8096147315535409e-06, + "loss": 0.3608, + "step": 8750 + }, + { + "epoch": 2.977169338038349, + "grad_norm": 6.460121108404991, + "learning_rate": 1.8070084090280554e-06, + "loss": 0.3916, + "step": 8755 + }, + { + "epoch": 2.978869946005697, + "grad_norm": 5.4173512022009, + "learning_rate": 1.8044029021275905e-06, + "loss": 0.381, + "step": 8760 + }, + { + "epoch": 2.9805705539730454, + "grad_norm": 6.102582510431227, + "learning_rate": 1.8017982139187303e-06, + "loss": 0.3747, + "step": 8765 + }, + { + "epoch": 2.9822711619403934, + "grad_norm": 3.955489197067311, + "learning_rate": 1.7991943474670942e-06, + "loss": 0.3737, + "step": 8770 + }, + { + "epoch": 2.983971769907742, + "grad_norm": 9.085768480316174, + "learning_rate": 1.7965913058373346e-06, + "loss": 0.394, + "step": 8775 + }, + { + "epoch": 2.9856723778750904, + "grad_norm": 10.373678822487804, + "learning_rate": 1.7939890920931346e-06, + "loss": 0.3818, + "step": 8780 + }, + { + "epoch": 2.9873729858424385, + "grad_norm": 3.4127171232770994, + "learning_rate": 1.7913877092972009e-06, + "loss": 0.3655, + "step": 8785 + }, + { + "epoch": 2.989073593809787, + "grad_norm": 4.316849860712377, + "learning_rate": 1.7887871605112635e-06, + "loss": 0.3831, + "step": 8790 + }, + { + "epoch": 2.9907742017771355, + "grad_norm": 4.078438401282003, + "learning_rate": 1.7861874487960707e-06, + "loss": 0.3907, + "step": 8795 + }, + { + "epoch": 2.9924748097444835, + "grad_norm": 4.4078567477042405, + "learning_rate": 1.7835885772113846e-06, + "loss": 0.3705, + "step": 8800 + }, + { + "epoch": 2.994175417711832, + "grad_norm": 5.163696644043879, + "learning_rate": 1.7809905488159799e-06, + "loss": 0.3633, + "step": 8805 + }, + { + "epoch": 2.9958760256791805, + "grad_norm": 3.1201895676547964, + "learning_rate": 1.7783933666676378e-06, + "loss": 0.3684, + "step": 8810 + }, + { + "epoch": 2.9975766336465286, + "grad_norm": 4.596905642906636, + "learning_rate": 1.775797033823144e-06, + "loss": 0.3733, + "step": 8815 + }, + { + "epoch": 2.999277241613877, + "grad_norm": 4.953548102346647, + "learning_rate": 1.773201553338285e-06, + "loss": 0.3861, + "step": 8820 + }, + { + "epoch": 3.0006802431869395, + "grad_norm": 4.837213083684313, + "learning_rate": 1.7706069282678436e-06, + "loss": 0.3291, + "step": 8825 + }, + { + "epoch": 3.0023808511542875, + "grad_norm": 4.289894082584498, + "learning_rate": 1.7680131616655954e-06, + "loss": 0.3361, + "step": 8830 + }, + { + "epoch": 3.004081459121636, + "grad_norm": 3.67762501007464, + "learning_rate": 1.7654202565843065e-06, + "loss": 0.3633, + "step": 8835 + }, + { + "epoch": 3.005782067088984, + "grad_norm": 8.297568543931774, + "learning_rate": 1.762828216075728e-06, + "loss": 0.3682, + "step": 8840 + }, + { + "epoch": 3.0074826750563326, + "grad_norm": 53.25224778009304, + "learning_rate": 1.7602370431905952e-06, + "loss": 0.3527, + "step": 8845 + }, + { + "epoch": 3.009183283023681, + "grad_norm": 8.804512663857691, + "learning_rate": 1.7576467409786196e-06, + "loss": 0.3799, + "step": 8850 + }, + { + "epoch": 3.010883890991029, + "grad_norm": 7.103880302255103, + "learning_rate": 1.7550573124884901e-06, + "loss": 0.3329, + "step": 8855 + }, + { + "epoch": 3.0125844989583777, + "grad_norm": 3.330752538820018, + "learning_rate": 1.7524687607678666e-06, + "loss": 0.345, + "step": 8860 + }, + { + "epoch": 3.014285106925726, + "grad_norm": 3.869647579638839, + "learning_rate": 1.749881088863377e-06, + "loss": 0.3655, + "step": 8865 + }, + { + "epoch": 3.015985714893074, + "grad_norm": 6.557253327984917, + "learning_rate": 1.7472942998206137e-06, + "loss": 0.3451, + "step": 8870 + }, + { + "epoch": 3.0176863228604227, + "grad_norm": 4.054072341150636, + "learning_rate": 1.74470839668413e-06, + "loss": 0.3508, + "step": 8875 + }, + { + "epoch": 3.0193869308277708, + "grad_norm": 6.761454233909646, + "learning_rate": 1.7421233824974367e-06, + "loss": 0.3523, + "step": 8880 + }, + { + "epoch": 3.0210875387951193, + "grad_norm": 7.976619077262149, + "learning_rate": 1.7395392603029984e-06, + "loss": 0.3578, + "step": 8885 + }, + { + "epoch": 3.0227881467624678, + "grad_norm": 3.1686734317850247, + "learning_rate": 1.7369560331422292e-06, + "loss": 0.3558, + "step": 8890 + }, + { + "epoch": 3.024488754729816, + "grad_norm": 5.485257218129294, + "learning_rate": 1.7343737040554908e-06, + "loss": 0.3611, + "step": 8895 + }, + { + "epoch": 3.0261893626971643, + "grad_norm": 3.962374694461126, + "learning_rate": 1.7317922760820868e-06, + "loss": 0.3563, + "step": 8900 + }, + { + "epoch": 3.0278899706645124, + "grad_norm": 3.051993159505651, + "learning_rate": 1.7292117522602608e-06, + "loss": 0.3558, + "step": 8905 + }, + { + "epoch": 3.029590578631861, + "grad_norm": 8.579964201999829, + "learning_rate": 1.7266321356271929e-06, + "loss": 0.36, + "step": 8910 + }, + { + "epoch": 3.0312911865992094, + "grad_norm": 17.88966263136647, + "learning_rate": 1.7240534292189937e-06, + "loss": 0.371, + "step": 8915 + }, + { + "epoch": 3.0329917945665574, + "grad_norm": 3.9542284226956825, + "learning_rate": 1.7214756360707047e-06, + "loss": 0.3697, + "step": 8920 + }, + { + "epoch": 3.034692402533906, + "grad_norm": 3.5475828665981006, + "learning_rate": 1.7188987592162907e-06, + "loss": 0.3687, + "step": 8925 + }, + { + "epoch": 3.036393010501254, + "grad_norm": 11.781307466979413, + "learning_rate": 1.7163228016886388e-06, + "loss": 0.3838, + "step": 8930 + }, + { + "epoch": 3.0380936184686025, + "grad_norm": 6.924010342238102, + "learning_rate": 1.7137477665195538e-06, + "loss": 0.3548, + "step": 8935 + }, + { + "epoch": 3.039794226435951, + "grad_norm": 2.5011090104936735, + "learning_rate": 1.711173656739756e-06, + "loss": 0.3519, + "step": 8940 + }, + { + "epoch": 3.041494834403299, + "grad_norm": 5.426408581554422, + "learning_rate": 1.7086004753788755e-06, + "loss": 0.3363, + "step": 8945 + }, + { + "epoch": 3.0431954423706475, + "grad_norm": 4.791366672605923, + "learning_rate": 1.7060282254654497e-06, + "loss": 0.3527, + "step": 8950 + }, + { + "epoch": 3.044896050337996, + "grad_norm": 6.066732269091804, + "learning_rate": 1.70345691002692e-06, + "loss": 0.3684, + "step": 8955 + }, + { + "epoch": 3.046596658305344, + "grad_norm": 4.22017798132994, + "learning_rate": 1.7008865320896279e-06, + "loss": 0.3665, + "step": 8960 + }, + { + "epoch": 3.0482972662726926, + "grad_norm": 3.464794634492069, + "learning_rate": 1.6983170946788114e-06, + "loss": 0.3529, + "step": 8965 + }, + { + "epoch": 3.0499978742400407, + "grad_norm": 3.9222375322099614, + "learning_rate": 1.6957486008186019e-06, + "loss": 0.3619, + "step": 8970 + }, + { + "epoch": 3.051698482207389, + "grad_norm": 4.633984968672792, + "learning_rate": 1.6931810535320194e-06, + "loss": 0.3632, + "step": 8975 + }, + { + "epoch": 3.0533990901747377, + "grad_norm": 6.470570887351088, + "learning_rate": 1.690614455840971e-06, + "loss": 0.3528, + "step": 8980 + }, + { + "epoch": 3.0550996981420857, + "grad_norm": 4.116975735798495, + "learning_rate": 1.6880488107662457e-06, + "loss": 0.3654, + "step": 8985 + }, + { + "epoch": 3.056800306109434, + "grad_norm": 5.905237632834165, + "learning_rate": 1.6854841213275105e-06, + "loss": 0.3465, + "step": 8990 + }, + { + "epoch": 3.0585009140767823, + "grad_norm": 3.8943219832538514, + "learning_rate": 1.6829203905433084e-06, + "loss": 0.3335, + "step": 8995 + }, + { + "epoch": 3.0602015220441308, + "grad_norm": 3.687716783594247, + "learning_rate": 1.680357621431055e-06, + "loss": 0.3576, + "step": 9000 + }, + { + "epoch": 3.0619021300114793, + "grad_norm": 5.149070151852963, + "learning_rate": 1.677795817007032e-06, + "loss": 0.3671, + "step": 9005 + }, + { + "epoch": 3.0636027379788273, + "grad_norm": 3.494475439567705, + "learning_rate": 1.6752349802863877e-06, + "loss": 0.3471, + "step": 9010 + }, + { + "epoch": 3.065303345946176, + "grad_norm": 4.954298151400518, + "learning_rate": 1.67267511428313e-06, + "loss": 0.3656, + "step": 9015 + }, + { + "epoch": 3.067003953913524, + "grad_norm": 28.806765576574186, + "learning_rate": 1.6701162220101249e-06, + "loss": 0.3686, + "step": 9020 + }, + { + "epoch": 3.0687045618808724, + "grad_norm": 6.240541900027824, + "learning_rate": 1.6675583064790923e-06, + "loss": 0.3642, + "step": 9025 + }, + { + "epoch": 3.070405169848221, + "grad_norm": 8.834333729157482, + "learning_rate": 1.665001370700603e-06, + "loss": 0.3629, + "step": 9030 + }, + { + "epoch": 3.072105777815569, + "grad_norm": 5.592392981972255, + "learning_rate": 1.6624454176840732e-06, + "loss": 0.3448, + "step": 9035 + }, + { + "epoch": 3.0738063857829174, + "grad_norm": 4.3555719604118615, + "learning_rate": 1.6598904504377638e-06, + "loss": 0.3514, + "step": 9040 + }, + { + "epoch": 3.075506993750266, + "grad_norm": 7.081636040320824, + "learning_rate": 1.6573364719687758e-06, + "loss": 0.3624, + "step": 9045 + }, + { + "epoch": 3.077207601717614, + "grad_norm": 2.918298030062138, + "learning_rate": 1.6547834852830447e-06, + "loss": 0.3239, + "step": 9050 + }, + { + "epoch": 3.0789082096849625, + "grad_norm": 9.006194751909424, + "learning_rate": 1.6522314933853395e-06, + "loss": 0.3435, + "step": 9055 + }, + { + "epoch": 3.0806088176523105, + "grad_norm": 5.497911481021905, + "learning_rate": 1.6496804992792604e-06, + "loss": 0.3404, + "step": 9060 + }, + { + "epoch": 3.082309425619659, + "grad_norm": 3.4235907173639317, + "learning_rate": 1.64713050596723e-06, + "loss": 0.3373, + "step": 9065 + }, + { + "epoch": 3.0840100335870075, + "grad_norm": 4.502392674589181, + "learning_rate": 1.6445815164504947e-06, + "loss": 0.3454, + "step": 9070 + }, + { + "epoch": 3.0857106415543556, + "grad_norm": 5.548983937554486, + "learning_rate": 1.6420335337291197e-06, + "loss": 0.3647, + "step": 9075 + }, + { + "epoch": 3.087411249521704, + "grad_norm": 4.505852938618983, + "learning_rate": 1.6394865608019842e-06, + "loss": 0.3529, + "step": 9080 + }, + { + "epoch": 3.089111857489052, + "grad_norm": 3.760432711472048, + "learning_rate": 1.6369406006667795e-06, + "loss": 0.3572, + "step": 9085 + }, + { + "epoch": 3.0908124654564006, + "grad_norm": 17.584180935267348, + "learning_rate": 1.6343956563200053e-06, + "loss": 0.3559, + "step": 9090 + }, + { + "epoch": 3.092513073423749, + "grad_norm": 3.4865540887340956, + "learning_rate": 1.6318517307569648e-06, + "loss": 0.3491, + "step": 9095 + }, + { + "epoch": 3.094213681391097, + "grad_norm": 9.388707399553276, + "learning_rate": 1.6293088269717633e-06, + "loss": 0.3304, + "step": 9100 + }, + { + "epoch": 3.0959142893584457, + "grad_norm": 4.835587502782633, + "learning_rate": 1.6267669479573023e-06, + "loss": 0.3427, + "step": 9105 + }, + { + "epoch": 3.0976148973257938, + "grad_norm": 6.611333595318636, + "learning_rate": 1.6242260967052776e-06, + "loss": 0.3757, + "step": 9110 + }, + { + "epoch": 3.0993155052931423, + "grad_norm": 7.241838210451506, + "learning_rate": 1.6216862762061753e-06, + "loss": 0.3755, + "step": 9115 + }, + { + "epoch": 3.1010161132604908, + "grad_norm": 7.020118083200031, + "learning_rate": 1.6191474894492698e-06, + "loss": 0.3425, + "step": 9120 + }, + { + "epoch": 3.102716721227839, + "grad_norm": 25.608830701241615, + "learning_rate": 1.6166097394226165e-06, + "loss": 0.3485, + "step": 9125 + }, + { + "epoch": 3.1044173291951873, + "grad_norm": 4.755580662343598, + "learning_rate": 1.6140730291130518e-06, + "loss": 0.3482, + "step": 9130 + }, + { + "epoch": 3.106117937162536, + "grad_norm": 4.701139521392903, + "learning_rate": 1.6115373615061886e-06, + "loss": 0.3559, + "step": 9135 + }, + { + "epoch": 3.107818545129884, + "grad_norm": 4.272936049205998, + "learning_rate": 1.6090027395864122e-06, + "loss": 0.3609, + "step": 9140 + }, + { + "epoch": 3.1095191530972324, + "grad_norm": 3.775254133893892, + "learning_rate": 1.606469166336877e-06, + "loss": 0.3529, + "step": 9145 + }, + { + "epoch": 3.1112197610645804, + "grad_norm": 4.470528534439289, + "learning_rate": 1.603936644739503e-06, + "loss": 0.349, + "step": 9150 + }, + { + "epoch": 3.112920369031929, + "grad_norm": 4.541302046969266, + "learning_rate": 1.6014051777749734e-06, + "loss": 0.3489, + "step": 9155 + }, + { + "epoch": 3.1146209769992774, + "grad_norm": 6.250817802123763, + "learning_rate": 1.5988747684227296e-06, + "loss": 0.3377, + "step": 9160 + }, + { + "epoch": 3.1163215849666255, + "grad_norm": 3.6038813728861294, + "learning_rate": 1.5963454196609673e-06, + "loss": 0.346, + "step": 9165 + }, + { + "epoch": 3.118022192933974, + "grad_norm": 6.335899443657108, + "learning_rate": 1.593817134466636e-06, + "loss": 0.3402, + "step": 9170 + }, + { + "epoch": 3.119722800901322, + "grad_norm": 6.969567561661201, + "learning_rate": 1.591289915815431e-06, + "loss": 0.3756, + "step": 9175 + }, + { + "epoch": 3.1214234088686705, + "grad_norm": 4.221959987706928, + "learning_rate": 1.588763766681794e-06, + "loss": 0.344, + "step": 9180 + }, + { + "epoch": 3.123124016836019, + "grad_norm": 4.604883109011117, + "learning_rate": 1.5862386900389081e-06, + "loss": 0.3714, + "step": 9185 + }, + { + "epoch": 3.124824624803367, + "grad_norm": 6.435163478598471, + "learning_rate": 1.5837146888586929e-06, + "loss": 0.3589, + "step": 9190 + }, + { + "epoch": 3.1265252327707156, + "grad_norm": 4.786425219960417, + "learning_rate": 1.581191766111803e-06, + "loss": 0.3357, + "step": 9195 + }, + { + "epoch": 3.1282258407380636, + "grad_norm": 23.43921705701697, + "learning_rate": 1.5786699247676232e-06, + "loss": 0.3623, + "step": 9200 + }, + { + "epoch": 3.129926448705412, + "grad_norm": 7.913970059170245, + "learning_rate": 1.5761491677942664e-06, + "loss": 0.3452, + "step": 9205 + }, + { + "epoch": 3.1316270566727606, + "grad_norm": 4.869166593219783, + "learning_rate": 1.573629498158568e-06, + "loss": 0.3327, + "step": 9210 + }, + { + "epoch": 3.1333276646401087, + "grad_norm": 5.034791596752279, + "learning_rate": 1.571110918826085e-06, + "loss": 0.3744, + "step": 9215 + }, + { + "epoch": 3.135028272607457, + "grad_norm": 6.145183450030516, + "learning_rate": 1.5685934327610902e-06, + "loss": 0.3625, + "step": 9220 + }, + { + "epoch": 3.1367288805748057, + "grad_norm": 6.360077325313443, + "learning_rate": 1.5660770429265696e-06, + "loss": 0.3476, + "step": 9225 + }, + { + "epoch": 3.1384294885421538, + "grad_norm": 5.018401042849134, + "learning_rate": 1.5635617522842197e-06, + "loss": 0.3413, + "step": 9230 + }, + { + "epoch": 3.1401300965095023, + "grad_norm": 26.970733338180644, + "learning_rate": 1.5610475637944428e-06, + "loss": 0.3496, + "step": 9235 + }, + { + "epoch": 3.1418307044768503, + "grad_norm": 3.169329874562929, + "learning_rate": 1.5585344804163443e-06, + "loss": 0.3591, + "step": 9240 + }, + { + "epoch": 3.143531312444199, + "grad_norm": 5.80691730548006, + "learning_rate": 1.5560225051077284e-06, + "loss": 0.3648, + "step": 9245 + }, + { + "epoch": 3.1452319204115473, + "grad_norm": 9.811921458054437, + "learning_rate": 1.5535116408250962e-06, + "loss": 0.3505, + "step": 9250 + }, + { + "epoch": 3.1469325283788954, + "grad_norm": 4.216795739393517, + "learning_rate": 1.5510018905236395e-06, + "loss": 0.3748, + "step": 9255 + }, + { + "epoch": 3.148633136346244, + "grad_norm": 8.489736019684033, + "learning_rate": 1.5484932571572397e-06, + "loss": 0.3417, + "step": 9260 + }, + { + "epoch": 3.150333744313592, + "grad_norm": 2.752951299093961, + "learning_rate": 1.5459857436784655e-06, + "loss": 0.3281, + "step": 9265 + }, + { + "epoch": 3.1520343522809404, + "grad_norm": 4.242891501647164, + "learning_rate": 1.543479353038565e-06, + "loss": 0.362, + "step": 9270 + }, + { + "epoch": 3.153734960248289, + "grad_norm": 6.2034575198668955, + "learning_rate": 1.5409740881874655e-06, + "loss": 0.343, + "step": 9275 + }, + { + "epoch": 3.155435568215637, + "grad_norm": 5.08187694241715, + "learning_rate": 1.5384699520737694e-06, + "loss": 0.3532, + "step": 9280 + }, + { + "epoch": 3.1571361761829855, + "grad_norm": 4.3705969329428545, + "learning_rate": 1.535966947644751e-06, + "loss": 0.3694, + "step": 9285 + }, + { + "epoch": 3.158836784150334, + "grad_norm": 8.073484197259168, + "learning_rate": 1.5334650778463522e-06, + "loss": 0.3432, + "step": 9290 + }, + { + "epoch": 3.160537392117682, + "grad_norm": 2.939116319643779, + "learning_rate": 1.5309643456231793e-06, + "loss": 0.3537, + "step": 9295 + }, + { + "epoch": 3.1622380000850305, + "grad_norm": 4.847115814511223, + "learning_rate": 1.5284647539185003e-06, + "loss": 0.3517, + "step": 9300 + }, + { + "epoch": 3.1639386080523786, + "grad_norm": 4.707389413634161, + "learning_rate": 1.5259663056742403e-06, + "loss": 0.3503, + "step": 9305 + }, + { + "epoch": 3.165639216019727, + "grad_norm": 5.03051762560974, + "learning_rate": 1.5234690038309791e-06, + "loss": 0.3575, + "step": 9310 + }, + { + "epoch": 3.1673398239870756, + "grad_norm": 3.3528175997137977, + "learning_rate": 1.520972851327947e-06, + "loss": 0.3639, + "step": 9315 + }, + { + "epoch": 3.1690404319544236, + "grad_norm": 4.350561925961647, + "learning_rate": 1.518477851103021e-06, + "loss": 0.3315, + "step": 9320 + }, + { + "epoch": 3.170741039921772, + "grad_norm": 18.377674093948617, + "learning_rate": 1.5159840060927234e-06, + "loss": 0.3701, + "step": 9325 + }, + { + "epoch": 3.17244164788912, + "grad_norm": 4.034533545384697, + "learning_rate": 1.5134913192322153e-06, + "loss": 0.3704, + "step": 9330 + }, + { + "epoch": 3.1741422558564687, + "grad_norm": 5.015297989412574, + "learning_rate": 1.5109997934552957e-06, + "loss": 0.355, + "step": 9335 + }, + { + "epoch": 3.175842863823817, + "grad_norm": 3.4254130478277465, + "learning_rate": 1.508509431694396e-06, + "loss": 0.3618, + "step": 9340 + }, + { + "epoch": 3.1775434717911653, + "grad_norm": 7.713471497146769, + "learning_rate": 1.506020236880579e-06, + "loss": 0.3788, + "step": 9345 + }, + { + "epoch": 3.1792440797585138, + "grad_norm": 4.7558887015027755, + "learning_rate": 1.503532211943533e-06, + "loss": 0.3519, + "step": 9350 + }, + { + "epoch": 3.180944687725862, + "grad_norm": 5.434857137728759, + "learning_rate": 1.5010453598115694e-06, + "loss": 0.369, + "step": 9355 + }, + { + "epoch": 3.1826452956932103, + "grad_norm": 5.727333889997782, + "learning_rate": 1.49855968341162e-06, + "loss": 0.3473, + "step": 9360 + }, + { + "epoch": 3.184345903660559, + "grad_norm": 4.348652363906678, + "learning_rate": 1.4960751856692323e-06, + "loss": 0.3323, + "step": 9365 + }, + { + "epoch": 3.186046511627907, + "grad_norm": 12.327728424966066, + "learning_rate": 1.4935918695085667e-06, + "loss": 0.3443, + "step": 9370 + }, + { + "epoch": 3.1877471195952554, + "grad_norm": 3.4232141587420175, + "learning_rate": 1.4911097378523926e-06, + "loss": 0.3536, + "step": 9375 + }, + { + "epoch": 3.1894477275626034, + "grad_norm": 7.428770846564405, + "learning_rate": 1.4886287936220851e-06, + "loss": 0.3467, + "step": 9380 + }, + { + "epoch": 3.191148335529952, + "grad_norm": 6.069640666537968, + "learning_rate": 1.4861490397376234e-06, + "loss": 0.3441, + "step": 9385 + }, + { + "epoch": 3.1928489434973004, + "grad_norm": 6.296746254965375, + "learning_rate": 1.4836704791175835e-06, + "loss": 0.3456, + "step": 9390 + }, + { + "epoch": 3.1945495514646485, + "grad_norm": 3.763029207673051, + "learning_rate": 1.4811931146791386e-06, + "loss": 0.3437, + "step": 9395 + }, + { + "epoch": 3.196250159431997, + "grad_norm": 3.93215412662426, + "learning_rate": 1.4787169493380529e-06, + "loss": 0.3565, + "step": 9400 + }, + { + "epoch": 3.1979507673993455, + "grad_norm": 3.2221714589571095, + "learning_rate": 1.4762419860086802e-06, + "loss": 0.355, + "step": 9405 + }, + { + "epoch": 3.1996513753666935, + "grad_norm": 5.083692562089596, + "learning_rate": 1.4737682276039589e-06, + "loss": 0.3467, + "step": 9410 + }, + { + "epoch": 3.201351983334042, + "grad_norm": 7.054467328041976, + "learning_rate": 1.4712956770354097e-06, + "loss": 0.3569, + "step": 9415 + }, + { + "epoch": 3.20305259130139, + "grad_norm": 3.813157998820209, + "learning_rate": 1.4688243372131314e-06, + "loss": 0.3686, + "step": 9420 + }, + { + "epoch": 3.2047531992687386, + "grad_norm": 5.237275286277567, + "learning_rate": 1.466354211045798e-06, + "loss": 0.358, + "step": 9425 + }, + { + "epoch": 3.206453807236087, + "grad_norm": 4.193675720923958, + "learning_rate": 1.4638853014406554e-06, + "loss": 0.3578, + "step": 9430 + }, + { + "epoch": 3.208154415203435, + "grad_norm": 6.747402085669682, + "learning_rate": 1.4614176113035166e-06, + "loss": 0.3744, + "step": 9435 + }, + { + "epoch": 3.2098550231707836, + "grad_norm": 4.575456821912026, + "learning_rate": 1.458951143538761e-06, + "loss": 0.3575, + "step": 9440 + }, + { + "epoch": 3.2115556311381317, + "grad_norm": 6.358850914941462, + "learning_rate": 1.4564859010493265e-06, + "loss": 0.3799, + "step": 9445 + }, + { + "epoch": 3.21325623910548, + "grad_norm": 3.402734864262026, + "learning_rate": 1.4540218867367134e-06, + "loss": 0.3557, + "step": 9450 + }, + { + "epoch": 3.2149568470728287, + "grad_norm": 5.46462070508825, + "learning_rate": 1.4515591035009713e-06, + "loss": 0.3759, + "step": 9455 + }, + { + "epoch": 3.2166574550401767, + "grad_norm": 5.384845485189256, + "learning_rate": 1.4490975542407054e-06, + "loss": 0.3587, + "step": 9460 + }, + { + "epoch": 3.2183580630075252, + "grad_norm": 11.201007213966772, + "learning_rate": 1.4466372418530644e-06, + "loss": 0.351, + "step": 9465 + }, + { + "epoch": 3.2200586709748737, + "grad_norm": 4.594061434244031, + "learning_rate": 1.4441781692337449e-06, + "loss": 0.3614, + "step": 9470 + }, + { + "epoch": 3.221759278942222, + "grad_norm": 3.8160947458124452, + "learning_rate": 1.441720339276983e-06, + "loss": 0.3412, + "step": 9475 + }, + { + "epoch": 3.2234598869095703, + "grad_norm": 3.6042225339848706, + "learning_rate": 1.4392637548755508e-06, + "loss": 0.3687, + "step": 9480 + }, + { + "epoch": 3.2251604948769184, + "grad_norm": 7.343903966138617, + "learning_rate": 1.4368084189207576e-06, + "loss": 0.3617, + "step": 9485 + }, + { + "epoch": 3.226861102844267, + "grad_norm": 5.543673064140564, + "learning_rate": 1.4343543343024388e-06, + "loss": 0.3608, + "step": 9490 + }, + { + "epoch": 3.2285617108116154, + "grad_norm": 7.081596418583588, + "learning_rate": 1.4319015039089623e-06, + "loss": 0.3261, + "step": 9495 + }, + { + "epoch": 3.2302623187789634, + "grad_norm": 4.026350638570551, + "learning_rate": 1.4294499306272147e-06, + "loss": 0.3636, + "step": 9500 + }, + { + "epoch": 3.231962926746312, + "grad_norm": 4.6889529787433135, + "learning_rate": 1.4269996173426081e-06, + "loss": 0.3603, + "step": 9505 + }, + { + "epoch": 3.23366353471366, + "grad_norm": 10.962216911027449, + "learning_rate": 1.4245505669390664e-06, + "loss": 0.3586, + "step": 9510 + }, + { + "epoch": 3.2353641426810085, + "grad_norm": 4.408765691773431, + "learning_rate": 1.422102782299032e-06, + "loss": 0.3623, + "step": 9515 + }, + { + "epoch": 3.237064750648357, + "grad_norm": 12.213129830596777, + "learning_rate": 1.419656266303453e-06, + "loss": 0.3354, + "step": 9520 + }, + { + "epoch": 3.238765358615705, + "grad_norm": 11.184768597148008, + "learning_rate": 1.4172110218317891e-06, + "loss": 0.3478, + "step": 9525 + }, + { + "epoch": 3.2404659665830535, + "grad_norm": 11.388174070470491, + "learning_rate": 1.4147670517619989e-06, + "loss": 0.3602, + "step": 9530 + }, + { + "epoch": 3.2421665745504016, + "grad_norm": 3.659338530912207, + "learning_rate": 1.4123243589705438e-06, + "loss": 0.3482, + "step": 9535 + }, + { + "epoch": 3.24386718251775, + "grad_norm": 3.228756400523898, + "learning_rate": 1.4098829463323827e-06, + "loss": 0.3728, + "step": 9540 + }, + { + "epoch": 3.2455677904850986, + "grad_norm": 4.017379303108377, + "learning_rate": 1.4074428167209641e-06, + "loss": 0.3634, + "step": 9545 + }, + { + "epoch": 3.2472683984524466, + "grad_norm": 20.13883721091286, + "learning_rate": 1.405003973008231e-06, + "loss": 0.3712, + "step": 9550 + }, + { + "epoch": 3.248969006419795, + "grad_norm": 7.343815939433381, + "learning_rate": 1.4025664180646088e-06, + "loss": 0.3542, + "step": 9555 + }, + { + "epoch": 3.250669614387143, + "grad_norm": 3.139097530497291, + "learning_rate": 1.4001301547590096e-06, + "loss": 0.3487, + "step": 9560 + }, + { + "epoch": 3.2523702223544917, + "grad_norm": 8.051406942129448, + "learning_rate": 1.3976951859588214e-06, + "loss": 0.3451, + "step": 9565 + }, + { + "epoch": 3.25407083032184, + "grad_norm": 4.9757182624069, + "learning_rate": 1.395261514529913e-06, + "loss": 0.3605, + "step": 9570 + }, + { + "epoch": 3.2557714382891882, + "grad_norm": 8.39339333496793, + "learning_rate": 1.3928291433366225e-06, + "loss": 0.3593, + "step": 9575 + }, + { + "epoch": 3.2574720462565367, + "grad_norm": 4.108719744608423, + "learning_rate": 1.3903980752417612e-06, + "loss": 0.3517, + "step": 9580 + }, + { + "epoch": 3.2591726542238852, + "grad_norm": 4.483900114111096, + "learning_rate": 1.387968313106602e-06, + "loss": 0.3529, + "step": 9585 + }, + { + "epoch": 3.2608732621912333, + "grad_norm": 6.602313068657708, + "learning_rate": 1.3855398597908865e-06, + "loss": 0.3715, + "step": 9590 + }, + { + "epoch": 3.262573870158582, + "grad_norm": 11.186997491144231, + "learning_rate": 1.3831127181528097e-06, + "loss": 0.3609, + "step": 9595 + }, + { + "epoch": 3.26427447812593, + "grad_norm": 3.702255195437605, + "learning_rate": 1.380686891049028e-06, + "loss": 0.34, + "step": 9600 + }, + { + "epoch": 3.2659750860932784, + "grad_norm": 3.111270454859789, + "learning_rate": 1.378262381334649e-06, + "loss": 0.3577, + "step": 9605 + }, + { + "epoch": 3.267675694060627, + "grad_norm": 3.9015262777944586, + "learning_rate": 1.3758391918632274e-06, + "loss": 0.366, + "step": 9610 + }, + { + "epoch": 3.269376302027975, + "grad_norm": 3.2201790060582733, + "learning_rate": 1.3734173254867686e-06, + "loss": 0.3388, + "step": 9615 + }, + { + "epoch": 3.2710769099953234, + "grad_norm": 5.846711105583726, + "learning_rate": 1.3709967850557155e-06, + "loss": 0.3732, + "step": 9620 + }, + { + "epoch": 3.2727775179626715, + "grad_norm": 5.302735003789467, + "learning_rate": 1.3685775734189554e-06, + "loss": 0.3364, + "step": 9625 + }, + { + "epoch": 3.27447812593002, + "grad_norm": 9.045329764476312, + "learning_rate": 1.3661596934238076e-06, + "loss": 0.3507, + "step": 9630 + }, + { + "epoch": 3.2761787338973685, + "grad_norm": 7.541951413082083, + "learning_rate": 1.3637431479160268e-06, + "loss": 0.355, + "step": 9635 + }, + { + "epoch": 3.2778793418647165, + "grad_norm": 5.455160556892848, + "learning_rate": 1.3613279397397954e-06, + "loss": 0.3548, + "step": 9640 + }, + { + "epoch": 3.279579949832065, + "grad_norm": 5.56820279064728, + "learning_rate": 1.358914071737724e-06, + "loss": 0.362, + "step": 9645 + }, + { + "epoch": 3.2812805577994135, + "grad_norm": 6.458486233565775, + "learning_rate": 1.356501546750842e-06, + "loss": 0.3503, + "step": 9650 + }, + { + "epoch": 3.2829811657667616, + "grad_norm": 16.080185721494754, + "learning_rate": 1.3540903676186021e-06, + "loss": 0.3605, + "step": 9655 + }, + { + "epoch": 3.28468177373411, + "grad_norm": 3.379459157677767, + "learning_rate": 1.3516805371788721e-06, + "loss": 0.3438, + "step": 9660 + }, + { + "epoch": 3.286382381701458, + "grad_norm": 4.213221873676639, + "learning_rate": 1.3492720582679297e-06, + "loss": 0.3657, + "step": 9665 + }, + { + "epoch": 3.2880829896688066, + "grad_norm": 5.550378842694407, + "learning_rate": 1.3468649337204665e-06, + "loss": 0.3413, + "step": 9670 + }, + { + "epoch": 3.2897835976361547, + "grad_norm": 4.188266219864308, + "learning_rate": 1.3444591663695743e-06, + "loss": 0.3623, + "step": 9675 + }, + { + "epoch": 3.291484205603503, + "grad_norm": 6.504902750656069, + "learning_rate": 1.3420547590467538e-06, + "loss": 0.3505, + "step": 9680 + }, + { + "epoch": 3.2931848135708517, + "grad_norm": 11.903424870560336, + "learning_rate": 1.3396517145818996e-06, + "loss": 0.3404, + "step": 9685 + }, + { + "epoch": 3.2948854215381997, + "grad_norm": 4.97108654701699, + "learning_rate": 1.3372500358033064e-06, + "loss": 0.3527, + "step": 9690 + }, + { + "epoch": 3.2965860295055482, + "grad_norm": 4.756803819783444, + "learning_rate": 1.334849725537658e-06, + "loss": 0.3654, + "step": 9695 + }, + { + "epoch": 3.2982866374728967, + "grad_norm": 3.5908380327207152, + "learning_rate": 1.3324507866100312e-06, + "loss": 0.3286, + "step": 9700 + }, + { + "epoch": 3.299987245440245, + "grad_norm": 4.592058291952547, + "learning_rate": 1.3300532218438848e-06, + "loss": 0.3373, + "step": 9705 + }, + { + "epoch": 3.3016878534075933, + "grad_norm": 3.1249667027320087, + "learning_rate": 1.3276570340610639e-06, + "loss": 0.353, + "step": 9710 + }, + { + "epoch": 3.3033884613749414, + "grad_norm": 10.738793625412239, + "learning_rate": 1.32526222608179e-06, + "loss": 0.3331, + "step": 9715 + }, + { + "epoch": 3.30508906934229, + "grad_norm": 4.611524702720749, + "learning_rate": 1.3228688007246627e-06, + "loss": 0.3527, + "step": 9720 + }, + { + "epoch": 3.3067896773096384, + "grad_norm": 7.857897100716977, + "learning_rate": 1.3204767608066543e-06, + "loss": 0.366, + "step": 9725 + }, + { + "epoch": 3.3084902852769864, + "grad_norm": 4.918137208400486, + "learning_rate": 1.3180861091431041e-06, + "loss": 0.3543, + "step": 9730 + }, + { + "epoch": 3.310190893244335, + "grad_norm": 6.304405555203345, + "learning_rate": 1.3156968485477207e-06, + "loss": 0.3586, + "step": 9735 + }, + { + "epoch": 3.311891501211683, + "grad_norm": 5.407731517296218, + "learning_rate": 1.3133089818325725e-06, + "loss": 0.3224, + "step": 9740 + }, + { + "epoch": 3.3135921091790315, + "grad_norm": 4.254600301547921, + "learning_rate": 1.3109225118080904e-06, + "loss": 0.375, + "step": 9745 + }, + { + "epoch": 3.31529271714638, + "grad_norm": 5.3879622240552445, + "learning_rate": 1.308537441283058e-06, + "loss": 0.3409, + "step": 9750 + }, + { + "epoch": 3.316993325113728, + "grad_norm": 5.503722668980672, + "learning_rate": 1.306153773064615e-06, + "loss": 0.3644, + "step": 9755 + }, + { + "epoch": 3.3186939330810765, + "grad_norm": 3.7292060947964063, + "learning_rate": 1.3037715099582477e-06, + "loss": 0.3719, + "step": 9760 + }, + { + "epoch": 3.320394541048425, + "grad_norm": 4.9213167491874135, + "learning_rate": 1.3013906547677923e-06, + "loss": 0.3705, + "step": 9765 + }, + { + "epoch": 3.322095149015773, + "grad_norm": 3.145540184588686, + "learning_rate": 1.299011210295423e-06, + "loss": 0.3474, + "step": 9770 + }, + { + "epoch": 3.3237957569831216, + "grad_norm": 4.681201076613843, + "learning_rate": 1.2966331793416581e-06, + "loss": 0.3492, + "step": 9775 + }, + { + "epoch": 3.3254963649504696, + "grad_norm": 15.673425550163644, + "learning_rate": 1.2942565647053513e-06, + "loss": 0.3586, + "step": 9780 + }, + { + "epoch": 3.327196972917818, + "grad_norm": 3.5449959702708687, + "learning_rate": 1.2918813691836862e-06, + "loss": 0.3597, + "step": 9785 + }, + { + "epoch": 3.3288975808851666, + "grad_norm": 5.776623181868395, + "learning_rate": 1.2895075955721812e-06, + "loss": 0.3443, + "step": 9790 + }, + { + "epoch": 3.3305981888525147, + "grad_norm": 5.284604667205001, + "learning_rate": 1.2871352466646762e-06, + "loss": 0.3454, + "step": 9795 + }, + { + "epoch": 3.332298796819863, + "grad_norm": 2.9524565603519894, + "learning_rate": 1.284764325253338e-06, + "loss": 0.3324, + "step": 9800 + }, + { + "epoch": 3.3339994047872112, + "grad_norm": 5.483482066166741, + "learning_rate": 1.282394834128651e-06, + "loss": 0.3702, + "step": 9805 + }, + { + "epoch": 3.3357000127545597, + "grad_norm": 3.6052555779675934, + "learning_rate": 1.2800267760794182e-06, + "loss": 0.3557, + "step": 9810 + }, + { + "epoch": 3.3374006207219082, + "grad_norm": 3.95121512021213, + "learning_rate": 1.2776601538927533e-06, + "loss": 0.3679, + "step": 9815 + }, + { + "epoch": 3.3391012286892563, + "grad_norm": 3.5054855290377427, + "learning_rate": 1.2752949703540831e-06, + "loss": 0.3335, + "step": 9820 + }, + { + "epoch": 3.340801836656605, + "grad_norm": 3.3110519465444477, + "learning_rate": 1.2729312282471379e-06, + "loss": 0.3564, + "step": 9825 + }, + { + "epoch": 3.3425024446239533, + "grad_norm": 5.756438579662786, + "learning_rate": 1.2705689303539553e-06, + "loss": 0.3464, + "step": 9830 + }, + { + "epoch": 3.3442030525913014, + "grad_norm": 5.266044233708819, + "learning_rate": 1.2682080794548687e-06, + "loss": 0.3399, + "step": 9835 + }, + { + "epoch": 3.34590366055865, + "grad_norm": 4.226321802166191, + "learning_rate": 1.2658486783285118e-06, + "loss": 0.3524, + "step": 9840 + }, + { + "epoch": 3.347604268525998, + "grad_norm": 3.425918661818654, + "learning_rate": 1.2634907297518122e-06, + "loss": 0.3493, + "step": 9845 + }, + { + "epoch": 3.3493048764933464, + "grad_norm": 33.576944515185474, + "learning_rate": 1.2611342364999843e-06, + "loss": 0.3627, + "step": 9850 + }, + { + "epoch": 3.3510054844606945, + "grad_norm": 14.704585043928068, + "learning_rate": 1.258779201346534e-06, + "loss": 0.3816, + "step": 9855 + }, + { + "epoch": 3.352706092428043, + "grad_norm": 6.061828813618805, + "learning_rate": 1.2564256270632474e-06, + "loss": 0.3325, + "step": 9860 + }, + { + "epoch": 3.3544067003953915, + "grad_norm": 5.616879925172779, + "learning_rate": 1.2540735164201945e-06, + "loss": 0.3661, + "step": 9865 + }, + { + "epoch": 3.3561073083627395, + "grad_norm": 3.4960002907362444, + "learning_rate": 1.2517228721857194e-06, + "loss": 0.3524, + "step": 9870 + }, + { + "epoch": 3.357807916330088, + "grad_norm": 6.37061786854243, + "learning_rate": 1.249373697126443e-06, + "loss": 0.3616, + "step": 9875 + }, + { + "epoch": 3.3595085242974365, + "grad_norm": 14.589626654492173, + "learning_rate": 1.2470259940072552e-06, + "loss": 0.3476, + "step": 9880 + }, + { + "epoch": 3.3612091322647846, + "grad_norm": 5.493479668859717, + "learning_rate": 1.244679765591315e-06, + "loss": 0.3585, + "step": 9885 + }, + { + "epoch": 3.362909740232133, + "grad_norm": 5.210868639805221, + "learning_rate": 1.2423350146400436e-06, + "loss": 0.3569, + "step": 9890 + }, + { + "epoch": 3.364610348199481, + "grad_norm": 5.493687640765632, + "learning_rate": 1.2399917439131257e-06, + "loss": 0.352, + "step": 9895 + }, + { + "epoch": 3.3663109561668296, + "grad_norm": 4.33835279527087, + "learning_rate": 1.2376499561685034e-06, + "loss": 0.35, + "step": 9900 + }, + { + "epoch": 3.368011564134178, + "grad_norm": 5.224804654043163, + "learning_rate": 1.235309654162371e-06, + "loss": 0.3589, + "step": 9905 + }, + { + "epoch": 3.369712172101526, + "grad_norm": 3.573205850458747, + "learning_rate": 1.2329708406491775e-06, + "loss": 0.345, + "step": 9910 + }, + { + "epoch": 3.3714127800688747, + "grad_norm": 4.358838772350418, + "learning_rate": 1.230633518381617e-06, + "loss": 0.3485, + "step": 9915 + }, + { + "epoch": 3.3731133880362227, + "grad_norm": 4.149921011912954, + "learning_rate": 1.2282976901106314e-06, + "loss": 0.353, + "step": 9920 + }, + { + "epoch": 3.3748139960035712, + "grad_norm": 4.861952397773383, + "learning_rate": 1.2259633585854006e-06, + "loss": 0.3715, + "step": 9925 + }, + { + "epoch": 3.3765146039709197, + "grad_norm": 4.230447350254913, + "learning_rate": 1.2236305265533472e-06, + "loss": 0.3388, + "step": 9930 + }, + { + "epoch": 3.378215211938268, + "grad_norm": 3.869994835859092, + "learning_rate": 1.2212991967601245e-06, + "loss": 0.371, + "step": 9935 + }, + { + "epoch": 3.3799158199056163, + "grad_norm": 5.2455469496553775, + "learning_rate": 1.218969371949622e-06, + "loss": 0.3621, + "step": 9940 + }, + { + "epoch": 3.381616427872965, + "grad_norm": 8.880842044601593, + "learning_rate": 1.216641054863954e-06, + "loss": 0.34, + "step": 9945 + }, + { + "epoch": 3.383317035840313, + "grad_norm": 4.18076871396702, + "learning_rate": 1.214314248243464e-06, + "loss": 0.3585, + "step": 9950 + }, + { + "epoch": 3.3850176438076613, + "grad_norm": 4.122412595315365, + "learning_rate": 1.2119889548267136e-06, + "loss": 0.3672, + "step": 9955 + }, + { + "epoch": 3.3867182517750094, + "grad_norm": 4.058013648340633, + "learning_rate": 1.2096651773504866e-06, + "loss": 0.3467, + "step": 9960 + }, + { + "epoch": 3.388418859742358, + "grad_norm": 3.5657280622355563, + "learning_rate": 1.2073429185497832e-06, + "loss": 0.3624, + "step": 9965 + }, + { + "epoch": 3.3901194677097064, + "grad_norm": 3.001269457589452, + "learning_rate": 1.205022181157812e-06, + "loss": 0.3391, + "step": 9970 + }, + { + "epoch": 3.3918200756770545, + "grad_norm": 11.074556669455953, + "learning_rate": 1.2027029679059958e-06, + "loss": 0.3506, + "step": 9975 + }, + { + "epoch": 3.393520683644403, + "grad_norm": 4.002246206732605, + "learning_rate": 1.2003852815239592e-06, + "loss": 0.3511, + "step": 9980 + }, + { + "epoch": 3.395221291611751, + "grad_norm": 7.74038371051314, + "learning_rate": 1.1980691247395341e-06, + "loss": 0.3362, + "step": 9985 + }, + { + "epoch": 3.3969218995790995, + "grad_norm": 6.430265373445128, + "learning_rate": 1.1957545002787475e-06, + "loss": 0.3665, + "step": 9990 + }, + { + "epoch": 3.398622507546448, + "grad_norm": 5.8787842931823855, + "learning_rate": 1.1934414108658273e-06, + "loss": 0.3347, + "step": 9995 + }, + { + "epoch": 3.400323115513796, + "grad_norm": 4.069184550986305, + "learning_rate": 1.1911298592231912e-06, + "loss": 0.3519, + "step": 10000 + }, + { + "epoch": 3.4020237234811446, + "grad_norm": 6.865449984895776, + "learning_rate": 1.18881984807145e-06, + "loss": 0.3487, + "step": 10005 + }, + { + "epoch": 3.403724331448493, + "grad_norm": 7.765969170122917, + "learning_rate": 1.1865113801293978e-06, + "loss": 0.3181, + "step": 10010 + }, + { + "epoch": 3.405424939415841, + "grad_norm": 5.891701530959368, + "learning_rate": 1.184204458114016e-06, + "loss": 0.3313, + "step": 10015 + }, + { + "epoch": 3.4071255473831896, + "grad_norm": 4.097355014099184, + "learning_rate": 1.1818990847404657e-06, + "loss": 0.3298, + "step": 10020 + }, + { + "epoch": 3.4088261553505377, + "grad_norm": 3.3676007474296843, + "learning_rate": 1.1795952627220825e-06, + "loss": 0.3368, + "step": 10025 + }, + { + "epoch": 3.410526763317886, + "grad_norm": 8.050189484594268, + "learning_rate": 1.1772929947703802e-06, + "loss": 0.3581, + "step": 10030 + }, + { + "epoch": 3.4122273712852342, + "grad_norm": 12.988329727387205, + "learning_rate": 1.1749922835950398e-06, + "loss": 0.3351, + "step": 10035 + }, + { + "epoch": 3.4139279792525827, + "grad_norm": 4.452551336342145, + "learning_rate": 1.1726931319039133e-06, + "loss": 0.3596, + "step": 10040 + }, + { + "epoch": 3.4156285872199312, + "grad_norm": 4.7390840314669775, + "learning_rate": 1.1703955424030142e-06, + "loss": 0.3524, + "step": 10045 + }, + { + "epoch": 3.4173291951872793, + "grad_norm": 3.853577268844155, + "learning_rate": 1.1680995177965205e-06, + "loss": 0.3579, + "step": 10050 + }, + { + "epoch": 3.419029803154628, + "grad_norm": 11.34337877012613, + "learning_rate": 1.165805060786765e-06, + "loss": 0.3589, + "step": 10055 + }, + { + "epoch": 3.4207304111219763, + "grad_norm": 6.19306633545983, + "learning_rate": 1.1635121740742391e-06, + "loss": 0.3604, + "step": 10060 + }, + { + "epoch": 3.4224310190893243, + "grad_norm": 4.282172329915891, + "learning_rate": 1.1612208603575822e-06, + "loss": 0.3643, + "step": 10065 + }, + { + "epoch": 3.424131627056673, + "grad_norm": 4.714097822547551, + "learning_rate": 1.1589311223335864e-06, + "loss": 0.3367, + "step": 10070 + }, + { + "epoch": 3.425832235024021, + "grad_norm": 6.438695059124917, + "learning_rate": 1.156642962697185e-06, + "loss": 0.3488, + "step": 10075 + }, + { + "epoch": 3.4275328429913694, + "grad_norm": 5.0249657678160125, + "learning_rate": 1.1543563841414571e-06, + "loss": 0.3586, + "step": 10080 + }, + { + "epoch": 3.429233450958718, + "grad_norm": 4.565364515731503, + "learning_rate": 1.1520713893576199e-06, + "loss": 0.3433, + "step": 10085 + }, + { + "epoch": 3.430934058926066, + "grad_norm": 6.5535294321935655, + "learning_rate": 1.1497879810350251e-06, + "loss": 0.3728, + "step": 10090 + }, + { + "epoch": 3.4326346668934145, + "grad_norm": 4.757141860105625, + "learning_rate": 1.14750616186116e-06, + "loss": 0.3664, + "step": 10095 + }, + { + "epoch": 3.4343352748607625, + "grad_norm": 3.2389630049290354, + "learning_rate": 1.1452259345216377e-06, + "loss": 0.3397, + "step": 10100 + }, + { + "epoch": 3.436035882828111, + "grad_norm": 3.3426138556572704, + "learning_rate": 1.1429473017002022e-06, + "loss": 0.3531, + "step": 10105 + }, + { + "epoch": 3.4377364907954595, + "grad_norm": 5.8805448197995815, + "learning_rate": 1.1406702660787163e-06, + "loss": 0.3621, + "step": 10110 + }, + { + "epoch": 3.4394370987628076, + "grad_norm": 3.438221317751023, + "learning_rate": 1.1383948303371675e-06, + "loss": 0.3341, + "step": 10115 + }, + { + "epoch": 3.441137706730156, + "grad_norm": 4.173544496895103, + "learning_rate": 1.136120997153656e-06, + "loss": 0.379, + "step": 10120 + }, + { + "epoch": 3.4428383146975046, + "grad_norm": 15.351690047816835, + "learning_rate": 1.1338487692044e-06, + "loss": 0.3483, + "step": 10125 + }, + { + "epoch": 3.4445389226648526, + "grad_norm": 4.903093752091242, + "learning_rate": 1.131578149163724e-06, + "loss": 0.3541, + "step": 10130 + }, + { + "epoch": 3.446239530632201, + "grad_norm": 4.0036016634813025, + "learning_rate": 1.1293091397040635e-06, + "loss": 0.3513, + "step": 10135 + }, + { + "epoch": 3.447940138599549, + "grad_norm": 3.691397103097567, + "learning_rate": 1.127041743495958e-06, + "loss": 0.3499, + "step": 10140 + }, + { + "epoch": 3.4496407465668977, + "grad_norm": 4.672776733697925, + "learning_rate": 1.1247759632080456e-06, + "loss": 0.3558, + "step": 10145 + }, + { + "epoch": 3.451341354534246, + "grad_norm": 4.709387888832122, + "learning_rate": 1.122511801507067e-06, + "loss": 0.3354, + "step": 10150 + }, + { + "epoch": 3.4530419625015942, + "grad_norm": 4.020983262420783, + "learning_rate": 1.120249261057852e-06, + "loss": 0.3786, + "step": 10155 + }, + { + "epoch": 3.4547425704689427, + "grad_norm": 3.9715982397829372, + "learning_rate": 1.117988344523329e-06, + "loss": 0.3354, + "step": 10160 + }, + { + "epoch": 3.456443178436291, + "grad_norm": 4.288485704118769, + "learning_rate": 1.1157290545645088e-06, + "loss": 0.3595, + "step": 10165 + }, + { + "epoch": 3.4581437864036393, + "grad_norm": 9.768926750109651, + "learning_rate": 1.113471393840493e-06, + "loss": 0.3412, + "step": 10170 + }, + { + "epoch": 3.459844394370988, + "grad_norm": 4.138834652364292, + "learning_rate": 1.1112153650084608e-06, + "loss": 0.3586, + "step": 10175 + }, + { + "epoch": 3.461545002338336, + "grad_norm": 3.753789179270736, + "learning_rate": 1.108960970723676e-06, + "loss": 0.3445, + "step": 10180 + }, + { + "epoch": 3.4632456103056843, + "grad_norm": 6.0473161120725045, + "learning_rate": 1.1067082136394732e-06, + "loss": 0.3383, + "step": 10185 + }, + { + "epoch": 3.464946218273033, + "grad_norm": 3.87336866761418, + "learning_rate": 1.1044570964072649e-06, + "loss": 0.349, + "step": 10190 + }, + { + "epoch": 3.466646826240381, + "grad_norm": 6.983398625443584, + "learning_rate": 1.1022076216765295e-06, + "loss": 0.3548, + "step": 10195 + }, + { + "epoch": 3.4683474342077294, + "grad_norm": 7.512321705522319, + "learning_rate": 1.0999597920948149e-06, + "loss": 0.3485, + "step": 10200 + }, + { + "epoch": 3.4700480421750775, + "grad_norm": 6.463333767875482, + "learning_rate": 1.097713610307733e-06, + "loss": 0.365, + "step": 10205 + }, + { + "epoch": 3.471748650142426, + "grad_norm": 4.028670236526878, + "learning_rate": 1.0954690789589533e-06, + "loss": 0.3472, + "step": 10210 + }, + { + "epoch": 3.473449258109774, + "grad_norm": 5.201598757551068, + "learning_rate": 1.0932262006902064e-06, + "loss": 0.353, + "step": 10215 + }, + { + "epoch": 3.4751498660771225, + "grad_norm": 5.152639228499577, + "learning_rate": 1.090984978141274e-06, + "loss": 0.344, + "step": 10220 + }, + { + "epoch": 3.476850474044471, + "grad_norm": 5.823956212740816, + "learning_rate": 1.0887454139499925e-06, + "loss": 0.3519, + "step": 10225 + }, + { + "epoch": 3.478551082011819, + "grad_norm": 4.225798446296788, + "learning_rate": 1.086507510752243e-06, + "loss": 0.3634, + "step": 10230 + }, + { + "epoch": 3.4802516899791676, + "grad_norm": 3.635547643893639, + "learning_rate": 1.0842712711819548e-06, + "loss": 0.357, + "step": 10235 + }, + { + "epoch": 3.481952297946516, + "grad_norm": 4.665937181986643, + "learning_rate": 1.0820366978710959e-06, + "loss": 0.3306, + "step": 10240 + }, + { + "epoch": 3.483652905913864, + "grad_norm": 4.106378454272665, + "learning_rate": 1.079803793449677e-06, + "loss": 0.3356, + "step": 10245 + }, + { + "epoch": 3.4853535138812126, + "grad_norm": 7.357366696829599, + "learning_rate": 1.0775725605457404e-06, + "loss": 0.3687, + "step": 10250 + }, + { + "epoch": 3.4870541218485607, + "grad_norm": 4.677520590767949, + "learning_rate": 1.0753430017853646e-06, + "loss": 0.3455, + "step": 10255 + }, + { + "epoch": 3.488754729815909, + "grad_norm": 5.516153393659592, + "learning_rate": 1.0731151197926573e-06, + "loss": 0.3439, + "step": 10260 + }, + { + "epoch": 3.4904553377832577, + "grad_norm": 5.04066022640554, + "learning_rate": 1.0708889171897497e-06, + "loss": 0.3464, + "step": 10265 + }, + { + "epoch": 3.4921559457506057, + "grad_norm": 4.141417524214379, + "learning_rate": 1.0686643965968002e-06, + "loss": 0.3646, + "step": 10270 + }, + { + "epoch": 3.4938565537179542, + "grad_norm": 5.619200791285557, + "learning_rate": 1.0664415606319843e-06, + "loss": 0.3681, + "step": 10275 + }, + { + "epoch": 3.4955571616853023, + "grad_norm": 5.899705389378892, + "learning_rate": 1.0642204119114976e-06, + "loss": 0.3381, + "step": 10280 + }, + { + "epoch": 3.497257769652651, + "grad_norm": 4.039755263654937, + "learning_rate": 1.0620009530495473e-06, + "loss": 0.3418, + "step": 10285 + }, + { + "epoch": 3.4989583776199993, + "grad_norm": 4.3477768599212085, + "learning_rate": 1.0597831866583547e-06, + "loss": 0.3375, + "step": 10290 + }, + { + "epoch": 3.5006589855873473, + "grad_norm": 6.6985895763333785, + "learning_rate": 1.057567115348145e-06, + "loss": 0.3603, + "step": 10295 + }, + { + "epoch": 3.502359593554696, + "grad_norm": 19.698218539245715, + "learning_rate": 1.055352741727153e-06, + "loss": 0.3559, + "step": 10300 + }, + { + "epoch": 3.5040602015220443, + "grad_norm": 5.609784918032436, + "learning_rate": 1.0531400684016114e-06, + "loss": 0.3465, + "step": 10305 + }, + { + "epoch": 3.5057608094893924, + "grad_norm": 3.851627555281216, + "learning_rate": 1.0509290979757548e-06, + "loss": 0.3581, + "step": 10310 + }, + { + "epoch": 3.507461417456741, + "grad_norm": 9.753059698528306, + "learning_rate": 1.0487198330518105e-06, + "loss": 0.3395, + "step": 10315 + }, + { + "epoch": 3.509162025424089, + "grad_norm": 19.66444586754753, + "learning_rate": 1.0465122762300015e-06, + "loss": 0.3558, + "step": 10320 + }, + { + "epoch": 3.5108626333914374, + "grad_norm": 4.973830782952403, + "learning_rate": 1.0443064301085394e-06, + "loss": 0.3457, + "step": 10325 + }, + { + "epoch": 3.5125632413587855, + "grad_norm": 4.6413672186230635, + "learning_rate": 1.0421022972836206e-06, + "loss": 0.3549, + "step": 10330 + }, + { + "epoch": 3.514263849326134, + "grad_norm": 47.803694846795985, + "learning_rate": 1.0398998803494282e-06, + "loss": 0.3592, + "step": 10335 + }, + { + "epoch": 3.5159644572934825, + "grad_norm": 36.10983220537593, + "learning_rate": 1.0376991818981225e-06, + "loss": 0.3571, + "step": 10340 + }, + { + "epoch": 3.5176650652608306, + "grad_norm": 7.619601200166429, + "learning_rate": 1.035500204519844e-06, + "loss": 0.3404, + "step": 10345 + }, + { + "epoch": 3.519365673228179, + "grad_norm": 20.895655303154214, + "learning_rate": 1.0333029508027048e-06, + "loss": 0.3671, + "step": 10350 + }, + { + "epoch": 3.5210662811955276, + "grad_norm": 6.555757587556768, + "learning_rate": 1.031107423332792e-06, + "loss": 0.3567, + "step": 10355 + }, + { + "epoch": 3.5227668891628756, + "grad_norm": 6.097681358369745, + "learning_rate": 1.028913624694156e-06, + "loss": 0.34, + "step": 10360 + }, + { + "epoch": 3.524467497130224, + "grad_norm": 5.384410281123741, + "learning_rate": 1.0267215574688183e-06, + "loss": 0.339, + "step": 10365 + }, + { + "epoch": 3.5261681050975726, + "grad_norm": 3.609206753858732, + "learning_rate": 1.024531224236757e-06, + "loss": 0.3541, + "step": 10370 + }, + { + "epoch": 3.5278687130649207, + "grad_norm": 4.68829761586958, + "learning_rate": 1.0223426275759126e-06, + "loss": 0.3696, + "step": 10375 + }, + { + "epoch": 3.529569321032269, + "grad_norm": 3.441930989952324, + "learning_rate": 1.0201557700621822e-06, + "loss": 0.355, + "step": 10380 + }, + { + "epoch": 3.5312699289996172, + "grad_norm": 4.185920112694374, + "learning_rate": 1.0179706542694131e-06, + "loss": 0.3331, + "step": 10385 + }, + { + "epoch": 3.5329705369669657, + "grad_norm": 5.302726584312579, + "learning_rate": 1.0157872827694059e-06, + "loss": 0.3426, + "step": 10390 + }, + { + "epoch": 3.534671144934314, + "grad_norm": 4.0486171277544445, + "learning_rate": 1.0136056581319054e-06, + "loss": 0.341, + "step": 10395 + }, + { + "epoch": 3.5363717529016623, + "grad_norm": 7.751605073146907, + "learning_rate": 1.0114257829246026e-06, + "loss": 0.3352, + "step": 10400 + }, + { + "epoch": 3.538072360869011, + "grad_norm": 3.840718121416357, + "learning_rate": 1.0092476597131274e-06, + "loss": 0.3685, + "step": 10405 + }, + { + "epoch": 3.539772968836359, + "grad_norm": 34.9378034765729, + "learning_rate": 1.0070712910610495e-06, + "loss": 0.3263, + "step": 10410 + }, + { + "epoch": 3.5414735768037073, + "grad_norm": 5.3549959797705275, + "learning_rate": 1.0048966795298731e-06, + "loss": 0.36, + "step": 10415 + }, + { + "epoch": 3.543174184771056, + "grad_norm": 4.467630253567017, + "learning_rate": 1.0027238276790348e-06, + "loss": 0.3355, + "step": 10420 + }, + { + "epoch": 3.544874792738404, + "grad_norm": 4.163273802824049, + "learning_rate": 1.0005527380658978e-06, + "loss": 0.3453, + "step": 10425 + }, + { + "epoch": 3.5465754007057524, + "grad_norm": 7.726891359050451, + "learning_rate": 9.983834132457549e-07, + "loss": 0.3289, + "step": 10430 + }, + { + "epoch": 3.548276008673101, + "grad_norm": 3.029339493191297, + "learning_rate": 9.962158557718172e-07, + "loss": 0.3581, + "step": 10435 + }, + { + "epoch": 3.549976616640449, + "grad_norm": 4.362906572985257, + "learning_rate": 9.940500681952208e-07, + "loss": 0.3826, + "step": 10440 + }, + { + "epoch": 3.5516772246077974, + "grad_norm": 4.018003641401903, + "learning_rate": 9.91886053065014e-07, + "loss": 0.3375, + "step": 10445 + }, + { + "epoch": 3.5533778325751455, + "grad_norm": 4.878837546624064, + "learning_rate": 9.897238129281633e-07, + "loss": 0.3217, + "step": 10450 + }, + { + "epoch": 3.555078440542494, + "grad_norm": 14.33326882112203, + "learning_rate": 9.875633503295417e-07, + "loss": 0.3542, + "step": 10455 + }, + { + "epoch": 3.556779048509842, + "grad_norm": 10.72392420053456, + "learning_rate": 9.854046678119347e-07, + "loss": 0.3526, + "step": 10460 + }, + { + "epoch": 3.5584796564771906, + "grad_norm": 3.5231895845560977, + "learning_rate": 9.83247767916028e-07, + "loss": 0.3417, + "step": 10465 + }, + { + "epoch": 3.560180264444539, + "grad_norm": 3.614269750006329, + "learning_rate": 9.810926531804129e-07, + "loss": 0.3525, + "step": 10470 + }, + { + "epoch": 3.561880872411887, + "grad_norm": 20.10042035284421, + "learning_rate": 9.789393261415792e-07, + "loss": 0.3569, + "step": 10475 + }, + { + "epoch": 3.5635814803792356, + "grad_norm": 3.64653363386056, + "learning_rate": 9.767877893339097e-07, + "loss": 0.3637, + "step": 10480 + }, + { + "epoch": 3.565282088346584, + "grad_norm": 5.791402748117439, + "learning_rate": 9.746380452896844e-07, + "loss": 0.3673, + "step": 10485 + }, + { + "epoch": 3.566982696313932, + "grad_norm": 3.9820409485030317, + "learning_rate": 9.72490096539069e-07, + "loss": 0.3451, + "step": 10490 + }, + { + "epoch": 3.5686833042812807, + "grad_norm": 5.764970808084148, + "learning_rate": 9.703439456101205e-07, + "loss": 0.3671, + "step": 10495 + }, + { + "epoch": 3.5703839122486287, + "grad_norm": 10.613146957278644, + "learning_rate": 9.681995950287756e-07, + "loss": 0.3462, + "step": 10500 + }, + { + "epoch": 3.572084520215977, + "grad_norm": 4.444097200241279, + "learning_rate": 9.660570473188565e-07, + "loss": 0.3362, + "step": 10505 + }, + { + "epoch": 3.5737851281833253, + "grad_norm": 4.151581287292085, + "learning_rate": 9.639163050020589e-07, + "loss": 0.3446, + "step": 10510 + }, + { + "epoch": 3.5754857361506738, + "grad_norm": 3.4422524347308134, + "learning_rate": 9.61777370597958e-07, + "loss": 0.331, + "step": 10515 + }, + { + "epoch": 3.5771863441180223, + "grad_norm": 6.291695215625101, + "learning_rate": 9.596402466239973e-07, + "loss": 0.3373, + "step": 10520 + }, + { + "epoch": 3.5788869520853703, + "grad_norm": 8.458224481826973, + "learning_rate": 9.57504935595492e-07, + "loss": 0.3491, + "step": 10525 + }, + { + "epoch": 3.580587560052719, + "grad_norm": 7.847005455939928, + "learning_rate": 9.55371440025624e-07, + "loss": 0.3554, + "step": 10530 + }, + { + "epoch": 3.5822881680200673, + "grad_norm": 4.161632621025964, + "learning_rate": 9.532397624254353e-07, + "loss": 0.3358, + "step": 10535 + }, + { + "epoch": 3.5839887759874154, + "grad_norm": 5.333937407524186, + "learning_rate": 9.511099053038319e-07, + "loss": 0.3403, + "step": 10540 + }, + { + "epoch": 3.585689383954764, + "grad_norm": 47.99447831985307, + "learning_rate": 9.489818711675742e-07, + "loss": 0.3723, + "step": 10545 + }, + { + "epoch": 3.5873899919221124, + "grad_norm": 4.823602301167504, + "learning_rate": 9.468556625212791e-07, + "loss": 0.3429, + "step": 10550 + }, + { + "epoch": 3.5890905998894604, + "grad_norm": 5.119510687553085, + "learning_rate": 9.447312818674134e-07, + "loss": 0.3423, + "step": 10555 + }, + { + "epoch": 3.590791207856809, + "grad_norm": 5.457065843868638, + "learning_rate": 9.426087317062943e-07, + "loss": 0.3309, + "step": 10560 + }, + { + "epoch": 3.592491815824157, + "grad_norm": 3.4550953372137454, + "learning_rate": 9.40488014536082e-07, + "loss": 0.365, + "step": 10565 + }, + { + "epoch": 3.5941924237915055, + "grad_norm": 5.41452648877141, + "learning_rate": 9.383691328527824e-07, + "loss": 0.3508, + "step": 10570 + }, + { + "epoch": 3.5958930317588536, + "grad_norm": 5.534474014926724, + "learning_rate": 9.362520891502377e-07, + "loss": 0.371, + "step": 10575 + }, + { + "epoch": 3.597593639726202, + "grad_norm": 4.940924121708319, + "learning_rate": 9.341368859201308e-07, + "loss": 0.3467, + "step": 10580 + }, + { + "epoch": 3.5992942476935506, + "grad_norm": 17.61714640188424, + "learning_rate": 9.320235256519741e-07, + "loss": 0.3715, + "step": 10585 + }, + { + "epoch": 3.6009948556608986, + "grad_norm": 5.751312456441521, + "learning_rate": 9.299120108331142e-07, + "loss": 0.3339, + "step": 10590 + }, + { + "epoch": 3.602695463628247, + "grad_norm": 5.166376894454055, + "learning_rate": 9.278023439487252e-07, + "loss": 0.363, + "step": 10595 + }, + { + "epoch": 3.6043960715955956, + "grad_norm": 3.863575377984854, + "learning_rate": 9.256945274818038e-07, + "loss": 0.3496, + "step": 10600 + }, + { + "epoch": 3.6060966795629437, + "grad_norm": 5.513614259293148, + "learning_rate": 9.23588563913173e-07, + "loss": 0.3335, + "step": 10605 + }, + { + "epoch": 3.607797287530292, + "grad_norm": 4.952598970243949, + "learning_rate": 9.214844557214705e-07, + "loss": 0.3779, + "step": 10610 + }, + { + "epoch": 3.6094978954976407, + "grad_norm": 9.341478820658665, + "learning_rate": 9.193822053831542e-07, + "loss": 0.3565, + "step": 10615 + }, + { + "epoch": 3.6111985034649887, + "grad_norm": 4.086573306032363, + "learning_rate": 9.172818153724919e-07, + "loss": 0.3567, + "step": 10620 + }, + { + "epoch": 3.612899111432337, + "grad_norm": 6.5010472580613925, + "learning_rate": 9.151832881615652e-07, + "loss": 0.3656, + "step": 10625 + }, + { + "epoch": 3.6145997193996853, + "grad_norm": 3.354870904762796, + "learning_rate": 9.130866262202603e-07, + "loss": 0.3638, + "step": 10630 + }, + { + "epoch": 3.6163003273670338, + "grad_norm": 4.1115236060147575, + "learning_rate": 9.109918320162708e-07, + "loss": 0.3469, + "step": 10635 + }, + { + "epoch": 3.618000935334382, + "grad_norm": 4.607088368228926, + "learning_rate": 9.08898908015089e-07, + "loss": 0.3366, + "step": 10640 + }, + { + "epoch": 3.6197015433017303, + "grad_norm": 18.186133697983188, + "learning_rate": 9.068078566800084e-07, + "loss": 0.3574, + "step": 10645 + }, + { + "epoch": 3.621402151269079, + "grad_norm": 3.9147702267000333, + "learning_rate": 9.047186804721189e-07, + "loss": 0.3317, + "step": 10650 + }, + { + "epoch": 3.623102759236427, + "grad_norm": 7.557513551869926, + "learning_rate": 9.026313818503002e-07, + "loss": 0.3489, + "step": 10655 + }, + { + "epoch": 3.6248033672037754, + "grad_norm": 5.3993015624251095, + "learning_rate": 9.005459632712263e-07, + "loss": 0.372, + "step": 10660 + }, + { + "epoch": 3.626503975171124, + "grad_norm": 5.053809013474026, + "learning_rate": 8.984624271893544e-07, + "loss": 0.3557, + "step": 10665 + }, + { + "epoch": 3.628204583138472, + "grad_norm": 6.460565333738014, + "learning_rate": 8.963807760569296e-07, + "loss": 0.347, + "step": 10670 + }, + { + "epoch": 3.6299051911058204, + "grad_norm": 4.628311970760183, + "learning_rate": 8.943010123239756e-07, + "loss": 0.3636, + "step": 10675 + }, + { + "epoch": 3.6316057990731685, + "grad_norm": 5.234295870033782, + "learning_rate": 8.922231384382976e-07, + "loss": 0.3487, + "step": 10680 + }, + { + "epoch": 3.633306407040517, + "grad_norm": 10.655149420447447, + "learning_rate": 8.901471568454734e-07, + "loss": 0.3623, + "step": 10685 + }, + { + "epoch": 3.635007015007865, + "grad_norm": 9.126701672294454, + "learning_rate": 8.880730699888565e-07, + "loss": 0.341, + "step": 10690 + }, + { + "epoch": 3.6367076229752135, + "grad_norm": 4.087730378521851, + "learning_rate": 8.86000880309568e-07, + "loss": 0.366, + "step": 10695 + }, + { + "epoch": 3.638408230942562, + "grad_norm": 3.4169310173879475, + "learning_rate": 8.839305902464982e-07, + "loss": 0.3535, + "step": 10700 + }, + { + "epoch": 3.64010883890991, + "grad_norm": 5.473157698564127, + "learning_rate": 8.818622022362991e-07, + "loss": 0.3298, + "step": 10705 + }, + { + "epoch": 3.6418094468772586, + "grad_norm": 3.986001428965386, + "learning_rate": 8.797957187133866e-07, + "loss": 0.3345, + "step": 10710 + }, + { + "epoch": 3.643510054844607, + "grad_norm": 5.36691398359891, + "learning_rate": 8.777311421099347e-07, + "loss": 0.3472, + "step": 10715 + }, + { + "epoch": 3.645210662811955, + "grad_norm": 8.850301792304556, + "learning_rate": 8.756684748558708e-07, + "loss": 0.3324, + "step": 10720 + }, + { + "epoch": 3.6469112707793037, + "grad_norm": 4.686613735711385, + "learning_rate": 8.736077193788781e-07, + "loss": 0.3572, + "step": 10725 + }, + { + "epoch": 3.648611878746652, + "grad_norm": 5.313370553096401, + "learning_rate": 8.715488781043869e-07, + "loss": 0.3466, + "step": 10730 + }, + { + "epoch": 3.650312486714, + "grad_norm": 4.201836558647826, + "learning_rate": 8.694919534555771e-07, + "loss": 0.3613, + "step": 10735 + }, + { + "epoch": 3.6520130946813487, + "grad_norm": 4.391708804040991, + "learning_rate": 8.674369478533701e-07, + "loss": 0.3609, + "step": 10740 + }, + { + "epoch": 3.6537137026486968, + "grad_norm": 5.3463161467714855, + "learning_rate": 8.653838637164321e-07, + "loss": 0.3543, + "step": 10745 + }, + { + "epoch": 3.6554143106160453, + "grad_norm": 3.983674499953354, + "learning_rate": 8.633327034611638e-07, + "loss": 0.3384, + "step": 10750 + }, + { + "epoch": 3.6571149185833933, + "grad_norm": 28.895736753259612, + "learning_rate": 8.612834695017055e-07, + "loss": 0.3391, + "step": 10755 + }, + { + "epoch": 3.658815526550742, + "grad_norm": 4.895856073408972, + "learning_rate": 8.59236164249927e-07, + "loss": 0.3486, + "step": 10760 + }, + { + "epoch": 3.6605161345180903, + "grad_norm": 7.232101311424993, + "learning_rate": 8.571907901154297e-07, + "loss": 0.3631, + "step": 10765 + }, + { + "epoch": 3.6622167424854384, + "grad_norm": 5.333254007085302, + "learning_rate": 8.551473495055435e-07, + "loss": 0.3474, + "step": 10770 + }, + { + "epoch": 3.663917350452787, + "grad_norm": 4.052411850251206, + "learning_rate": 8.53105844825319e-07, + "loss": 0.3477, + "step": 10775 + }, + { + "epoch": 3.6656179584201354, + "grad_norm": 6.471828711368031, + "learning_rate": 8.510662784775322e-07, + "loss": 0.3633, + "step": 10780 + }, + { + "epoch": 3.6673185663874834, + "grad_norm": 4.578665714371467, + "learning_rate": 8.490286528626743e-07, + "loss": 0.3554, + "step": 10785 + }, + { + "epoch": 3.669019174354832, + "grad_norm": 4.960024978436364, + "learning_rate": 8.469929703789554e-07, + "loss": 0.3398, + "step": 10790 + }, + { + "epoch": 3.6707197823221804, + "grad_norm": 7.649576478459408, + "learning_rate": 8.449592334222956e-07, + "loss": 0.3676, + "step": 10795 + }, + { + "epoch": 3.6724203902895285, + "grad_norm": 3.27366413812755, + "learning_rate": 8.429274443863286e-07, + "loss": 0.3652, + "step": 10800 + }, + { + "epoch": 3.674120998256877, + "grad_norm": 5.600994248558118, + "learning_rate": 8.408976056623919e-07, + "loss": 0.3524, + "step": 10805 + }, + { + "epoch": 3.675821606224225, + "grad_norm": 5.647153791168265, + "learning_rate": 8.388697196395309e-07, + "loss": 0.3387, + "step": 10810 + }, + { + "epoch": 3.6775222141915735, + "grad_norm": 4.115828451734707, + "learning_rate": 8.368437887044895e-07, + "loss": 0.3659, + "step": 10815 + }, + { + "epoch": 3.6792228221589216, + "grad_norm": 3.1285968581963863, + "learning_rate": 8.348198152417136e-07, + "loss": 0.3435, + "step": 10820 + }, + { + "epoch": 3.68092343012627, + "grad_norm": 3.332093467858973, + "learning_rate": 8.327978016333424e-07, + "loss": 0.3332, + "step": 10825 + }, + { + "epoch": 3.6826240380936186, + "grad_norm": 5.434188342715908, + "learning_rate": 8.307777502592104e-07, + "loss": 0.3415, + "step": 10830 + }, + { + "epoch": 3.6843246460609667, + "grad_norm": 5.252703651996574, + "learning_rate": 8.287596634968431e-07, + "loss": 0.3264, + "step": 10835 + }, + { + "epoch": 3.686025254028315, + "grad_norm": 5.025378368137891, + "learning_rate": 8.26743543721451e-07, + "loss": 0.3666, + "step": 10840 + }, + { + "epoch": 3.6877258619956637, + "grad_norm": 6.449464171627322, + "learning_rate": 8.247293933059328e-07, + "loss": 0.3587, + "step": 10845 + }, + { + "epoch": 3.6894264699630117, + "grad_norm": 27.15790696281429, + "learning_rate": 8.227172146208659e-07, + "loss": 0.3541, + "step": 10850 + }, + { + "epoch": 3.69112707793036, + "grad_norm": 3.6536930635245595, + "learning_rate": 8.207070100345107e-07, + "loss": 0.3496, + "step": 10855 + }, + { + "epoch": 3.6928276858977083, + "grad_norm": 6.4663982405136835, + "learning_rate": 8.186987819128008e-07, + "loss": 0.3581, + "step": 10860 + }, + { + "epoch": 3.6945282938650568, + "grad_norm": 3.8326194223116095, + "learning_rate": 8.166925326193464e-07, + "loss": 0.3538, + "step": 10865 + }, + { + "epoch": 3.696228901832405, + "grad_norm": 5.176863609255767, + "learning_rate": 8.146882645154263e-07, + "loss": 0.3684, + "step": 10870 + }, + { + "epoch": 3.6979295097997533, + "grad_norm": 4.983850690407328, + "learning_rate": 8.126859799599898e-07, + "loss": 0.3334, + "step": 10875 + }, + { + "epoch": 3.699630117767102, + "grad_norm": 14.71099045181069, + "learning_rate": 8.106856813096492e-07, + "loss": 0.3413, + "step": 10880 + }, + { + "epoch": 3.70133072573445, + "grad_norm": 13.00188359280808, + "learning_rate": 8.08687370918681e-07, + "loss": 0.3545, + "step": 10885 + }, + { + "epoch": 3.7030313337017984, + "grad_norm": 3.8544979857678414, + "learning_rate": 8.066910511390228e-07, + "loss": 0.3567, + "step": 10890 + }, + { + "epoch": 3.704731941669147, + "grad_norm": 5.130480817192012, + "learning_rate": 8.046967243202656e-07, + "loss": 0.3415, + "step": 10895 + }, + { + "epoch": 3.706432549636495, + "grad_norm": 7.468282471790433, + "learning_rate": 8.02704392809659e-07, + "loss": 0.3237, + "step": 10900 + }, + { + "epoch": 3.7081331576038434, + "grad_norm": 9.985122000430085, + "learning_rate": 8.007140589521006e-07, + "loss": 0.3304, + "step": 10905 + }, + { + "epoch": 3.709833765571192, + "grad_norm": 6.9889392828863475, + "learning_rate": 7.987257250901398e-07, + "loss": 0.3331, + "step": 10910 + }, + { + "epoch": 3.71153437353854, + "grad_norm": 11.080424473644747, + "learning_rate": 7.967393935639695e-07, + "loss": 0.346, + "step": 10915 + }, + { + "epoch": 3.7132349815058885, + "grad_norm": 4.061944846859679, + "learning_rate": 7.947550667114284e-07, + "loss": 0.3547, + "step": 10920 + }, + { + "epoch": 3.7149355894732365, + "grad_norm": 4.106047339463541, + "learning_rate": 7.92772746867993e-07, + "loss": 0.3422, + "step": 10925 + }, + { + "epoch": 3.716636197440585, + "grad_norm": 3.838078261798806, + "learning_rate": 7.907924363667807e-07, + "loss": 0.3369, + "step": 10930 + }, + { + "epoch": 3.718336805407933, + "grad_norm": 6.69051307658985, + "learning_rate": 7.888141375385411e-07, + "loss": 0.3575, + "step": 10935 + }, + { + "epoch": 3.7200374133752816, + "grad_norm": 3.667423958159118, + "learning_rate": 7.868378527116588e-07, + "loss": 0.3403, + "step": 10940 + }, + { + "epoch": 3.72173802134263, + "grad_norm": 4.360620344296931, + "learning_rate": 7.848635842121452e-07, + "loss": 0.3491, + "step": 10945 + }, + { + "epoch": 3.723438629309978, + "grad_norm": 2.873390222661289, + "learning_rate": 7.828913343636407e-07, + "loss": 0.3198, + "step": 10950 + }, + { + "epoch": 3.7251392372773267, + "grad_norm": 2.610382891982884, + "learning_rate": 7.809211054874102e-07, + "loss": 0.3493, + "step": 10955 + }, + { + "epoch": 3.726839845244675, + "grad_norm": 5.011617850920955, + "learning_rate": 7.789528999023369e-07, + "loss": 0.3472, + "step": 10960 + }, + { + "epoch": 3.728540453212023, + "grad_norm": 4.893587266486058, + "learning_rate": 7.769867199249264e-07, + "loss": 0.3603, + "step": 10965 + }, + { + "epoch": 3.7302410611793717, + "grad_norm": 4.605818910102079, + "learning_rate": 7.750225678692974e-07, + "loss": 0.3347, + "step": 10970 + }, + { + "epoch": 3.73194166914672, + "grad_norm": 3.330070000188197, + "learning_rate": 7.730604460471841e-07, + "loss": 0.3323, + "step": 10975 + }, + { + "epoch": 3.7336422771140683, + "grad_norm": 5.063438012995593, + "learning_rate": 7.711003567679285e-07, + "loss": 0.3396, + "step": 10980 + }, + { + "epoch": 3.7353428850814168, + "grad_norm": 5.161235017584308, + "learning_rate": 7.691423023384837e-07, + "loss": 0.3579, + "step": 10985 + }, + { + "epoch": 3.737043493048765, + "grad_norm": 5.530851535182855, + "learning_rate": 7.671862850634041e-07, + "loss": 0.3605, + "step": 10990 + }, + { + "epoch": 3.7387441010161133, + "grad_norm": 6.390053527359117, + "learning_rate": 7.652323072448503e-07, + "loss": 0.3411, + "step": 10995 + }, + { + "epoch": 3.7404447089834614, + "grad_norm": 7.104792676720178, + "learning_rate": 7.63280371182579e-07, + "loss": 0.3454, + "step": 11000 + }, + { + "epoch": 3.74214531695081, + "grad_norm": 2.8862857830970543, + "learning_rate": 7.613304791739465e-07, + "loss": 0.3636, + "step": 11005 + }, + { + "epoch": 3.7438459249181584, + "grad_norm": 5.550515786579572, + "learning_rate": 7.593826335139026e-07, + "loss": 0.3541, + "step": 11010 + }, + { + "epoch": 3.7455465328855064, + "grad_norm": 4.427229901279991, + "learning_rate": 7.574368364949872e-07, + "loss": 0.3323, + "step": 11015 + }, + { + "epoch": 3.747247140852855, + "grad_norm": 4.178971671598606, + "learning_rate": 7.554930904073313e-07, + "loss": 0.3335, + "step": 11020 + }, + { + "epoch": 3.7489477488202034, + "grad_norm": 3.8652351408257153, + "learning_rate": 7.535513975386496e-07, + "loss": 0.3588, + "step": 11025 + }, + { + "epoch": 3.7506483567875515, + "grad_norm": 5.292558860177557, + "learning_rate": 7.516117601742434e-07, + "loss": 0.34, + "step": 11030 + }, + { + "epoch": 3.7523489647549, + "grad_norm": 3.826163337894789, + "learning_rate": 7.496741805969907e-07, + "loss": 0.3537, + "step": 11035 + }, + { + "epoch": 3.754049572722248, + "grad_norm": 6.067442756842923, + "learning_rate": 7.477386610873516e-07, + "loss": 0.3455, + "step": 11040 + }, + { + "epoch": 3.7557501806895965, + "grad_norm": 2.9473887584803076, + "learning_rate": 7.458052039233582e-07, + "loss": 0.3453, + "step": 11045 + }, + { + "epoch": 3.7574507886569446, + "grad_norm": 4.690369858336077, + "learning_rate": 7.438738113806184e-07, + "loss": 0.3557, + "step": 11050 + }, + { + "epoch": 3.759151396624293, + "grad_norm": 4.491174167199348, + "learning_rate": 7.419444857323068e-07, + "loss": 0.3472, + "step": 11055 + }, + { + "epoch": 3.7608520045916416, + "grad_norm": 3.799149701193832, + "learning_rate": 7.400172292491686e-07, + "loss": 0.3458, + "step": 11060 + }, + { + "epoch": 3.7625526125589897, + "grad_norm": 4.673187961284107, + "learning_rate": 7.38092044199511e-07, + "loss": 0.3556, + "step": 11065 + }, + { + "epoch": 3.764253220526338, + "grad_norm": 4.23084028829816, + "learning_rate": 7.361689328492044e-07, + "loss": 0.3521, + "step": 11070 + }, + { + "epoch": 3.7659538284936867, + "grad_norm": 3.152798840011891, + "learning_rate": 7.342478974616799e-07, + "loss": 0.3581, + "step": 11075 + }, + { + "epoch": 3.7676544364610347, + "grad_norm": 23.4908790433505, + "learning_rate": 7.323289402979223e-07, + "loss": 0.3593, + "step": 11080 + }, + { + "epoch": 3.769355044428383, + "grad_norm": 16.479284492601987, + "learning_rate": 7.30412063616473e-07, + "loss": 0.3544, + "step": 11085 + }, + { + "epoch": 3.7710556523957317, + "grad_norm": 4.3508974680303, + "learning_rate": 7.284972696734225e-07, + "loss": 0.3333, + "step": 11090 + }, + { + "epoch": 3.7727562603630798, + "grad_norm": 3.952590145244473, + "learning_rate": 7.265845607224125e-07, + "loss": 0.3402, + "step": 11095 + }, + { + "epoch": 3.7744568683304283, + "grad_norm": 3.421443781155386, + "learning_rate": 7.246739390146285e-07, + "loss": 0.3609, + "step": 11100 + }, + { + "epoch": 3.7761574762977763, + "grad_norm": 8.340835601491221, + "learning_rate": 7.227654067988013e-07, + "loss": 0.3547, + "step": 11105 + }, + { + "epoch": 3.777858084265125, + "grad_norm": 3.411412544859863, + "learning_rate": 7.208589663212002e-07, + "loss": 0.3497, + "step": 11110 + }, + { + "epoch": 3.779558692232473, + "grad_norm": 3.567161121256871, + "learning_rate": 7.18954619825635e-07, + "loss": 0.37, + "step": 11115 + }, + { + "epoch": 3.7812593001998214, + "grad_norm": 3.615767506793355, + "learning_rate": 7.17052369553449e-07, + "loss": 0.3336, + "step": 11120 + }, + { + "epoch": 3.78295990816717, + "grad_norm": 4.134690328545315, + "learning_rate": 7.151522177435196e-07, + "loss": 0.3551, + "step": 11125 + }, + { + "epoch": 3.784660516134518, + "grad_norm": 4.0514203148253385, + "learning_rate": 7.132541666322548e-07, + "loss": 0.3326, + "step": 11130 + }, + { + "epoch": 3.7863611241018664, + "grad_norm": 4.172031178322271, + "learning_rate": 7.113582184535874e-07, + "loss": 0.3582, + "step": 11135 + }, + { + "epoch": 3.788061732069215, + "grad_norm": 2.7164967983329187, + "learning_rate": 7.094643754389794e-07, + "loss": 0.3478, + "step": 11140 + }, + { + "epoch": 3.789762340036563, + "grad_norm": 3.3892192131767804, + "learning_rate": 7.075726398174104e-07, + "loss": 0.3636, + "step": 11145 + }, + { + "epoch": 3.7914629480039115, + "grad_norm": 2.963601178791977, + "learning_rate": 7.056830138153842e-07, + "loss": 0.361, + "step": 11150 + }, + { + "epoch": 3.79316355597126, + "grad_norm": 12.594805720891362, + "learning_rate": 7.037954996569174e-07, + "loss": 0.3305, + "step": 11155 + }, + { + "epoch": 3.794864163938608, + "grad_norm": 3.162660613013296, + "learning_rate": 7.01910099563545e-07, + "loss": 0.3404, + "step": 11160 + }, + { + "epoch": 3.7965647719059565, + "grad_norm": 3.8671981846704053, + "learning_rate": 7.000268157543102e-07, + "loss": 0.352, + "step": 11165 + }, + { + "epoch": 3.7982653798733046, + "grad_norm": 3.331325951800168, + "learning_rate": 6.981456504457687e-07, + "loss": 0.3444, + "step": 11170 + }, + { + "epoch": 3.799965987840653, + "grad_norm": 4.034136357496109, + "learning_rate": 6.962666058519801e-07, + "loss": 0.329, + "step": 11175 + }, + { + "epoch": 3.801666595808001, + "grad_norm": 5.7048239174405735, + "learning_rate": 6.943896841845105e-07, + "loss": 0.3606, + "step": 11180 + }, + { + "epoch": 3.8033672037753496, + "grad_norm": 3.9899757130108324, + "learning_rate": 6.925148876524243e-07, + "loss": 0.3587, + "step": 11185 + }, + { + "epoch": 3.805067811742698, + "grad_norm": 8.230030083565003, + "learning_rate": 6.906422184622874e-07, + "loss": 0.3427, + "step": 11190 + }, + { + "epoch": 3.806768419710046, + "grad_norm": 73.47520067805937, + "learning_rate": 6.88771678818162e-07, + "loss": 0.3474, + "step": 11195 + }, + { + "epoch": 3.8084690276773947, + "grad_norm": 3.933562785537101, + "learning_rate": 6.869032709216009e-07, + "loss": 0.3227, + "step": 11200 + }, + { + "epoch": 3.810169635644743, + "grad_norm": 4.750777773820389, + "learning_rate": 6.850369969716514e-07, + "loss": 0.3598, + "step": 11205 + }, + { + "epoch": 3.8118702436120913, + "grad_norm": 4.465645703924592, + "learning_rate": 6.831728591648465e-07, + "loss": 0.3295, + "step": 11210 + }, + { + "epoch": 3.8135708515794398, + "grad_norm": 5.031035772172988, + "learning_rate": 6.813108596952075e-07, + "loss": 0.3438, + "step": 11215 + }, + { + "epoch": 3.815271459546788, + "grad_norm": 3.7204766759148225, + "learning_rate": 6.794510007542363e-07, + "loss": 0.3468, + "step": 11220 + }, + { + "epoch": 3.8169720675141363, + "grad_norm": 31.049482893342063, + "learning_rate": 6.775932845309183e-07, + "loss": 0.3222, + "step": 11225 + }, + { + "epoch": 3.8186726754814844, + "grad_norm": 3.281552307266753, + "learning_rate": 6.757377132117144e-07, + "loss": 0.3525, + "step": 11230 + }, + { + "epoch": 3.820373283448833, + "grad_norm": 6.6996368343978645, + "learning_rate": 6.73884288980563e-07, + "loss": 0.3483, + "step": 11235 + }, + { + "epoch": 3.8220738914161814, + "grad_norm": 8.196919341429526, + "learning_rate": 6.720330140188738e-07, + "loss": 0.3287, + "step": 11240 + }, + { + "epoch": 3.8237744993835294, + "grad_norm": 6.205721311864081, + "learning_rate": 6.701838905055283e-07, + "loss": 0.3471, + "step": 11245 + }, + { + "epoch": 3.825475107350878, + "grad_norm": 3.5349480188081577, + "learning_rate": 6.68336920616876e-07, + "loss": 0.354, + "step": 11250 + }, + { + "epoch": 3.8271757153182264, + "grad_norm": 7.080195985500879, + "learning_rate": 6.664921065267294e-07, + "loss": 0.346, + "step": 11255 + }, + { + "epoch": 3.8288763232855745, + "grad_norm": 3.8942073965633903, + "learning_rate": 6.646494504063669e-07, + "loss": 0.3302, + "step": 11260 + }, + { + "epoch": 3.830576931252923, + "grad_norm": 4.2304240711639896, + "learning_rate": 6.628089544245239e-07, + "loss": 0.3488, + "step": 11265 + }, + { + "epoch": 3.8322775392202715, + "grad_norm": 6.856566626034545, + "learning_rate": 6.609706207473962e-07, + "loss": 0.3347, + "step": 11270 + }, + { + "epoch": 3.8339781471876195, + "grad_norm": 4.627238887062445, + "learning_rate": 6.591344515386322e-07, + "loss": 0.3409, + "step": 11275 + }, + { + "epoch": 3.835678755154968, + "grad_norm": 3.9126156850905693, + "learning_rate": 6.573004489593352e-07, + "loss": 0.3273, + "step": 11280 + }, + { + "epoch": 3.837379363122316, + "grad_norm": 9.229389838058808, + "learning_rate": 6.554686151680553e-07, + "loss": 0.3605, + "step": 11285 + }, + { + "epoch": 3.8390799710896646, + "grad_norm": 4.1718374787018835, + "learning_rate": 6.536389523207942e-07, + "loss": 0.3593, + "step": 11290 + }, + { + "epoch": 3.8407805790570126, + "grad_norm": 4.279657448841572, + "learning_rate": 6.518114625709946e-07, + "loss": 0.3401, + "step": 11295 + }, + { + "epoch": 3.842481187024361, + "grad_norm": 3.7404141093085412, + "learning_rate": 6.499861480695441e-07, + "loss": 0.3624, + "step": 11300 + }, + { + "epoch": 3.8441817949917096, + "grad_norm": 4.092752721420215, + "learning_rate": 6.48163010964768e-07, + "loss": 0.3372, + "step": 11305 + }, + { + "epoch": 3.8458824029590577, + "grad_norm": 3.9814207225263187, + "learning_rate": 6.463420534024309e-07, + "loss": 0.3357, + "step": 11310 + }, + { + "epoch": 3.847583010926406, + "grad_norm": 4.154660851481634, + "learning_rate": 6.445232775257318e-07, + "loss": 0.331, + "step": 11315 + }, + { + "epoch": 3.8492836188937547, + "grad_norm": 4.795935575498587, + "learning_rate": 6.427066854753003e-07, + "loss": 0.3449, + "step": 11320 + }, + { + "epoch": 3.8509842268611028, + "grad_norm": 5.521989090755959, + "learning_rate": 6.408922793891981e-07, + "loss": 0.3345, + "step": 11325 + }, + { + "epoch": 3.8526848348284513, + "grad_norm": 3.6124264673601765, + "learning_rate": 6.390800614029116e-07, + "loss": 0.3611, + "step": 11330 + }, + { + "epoch": 3.8543854427957998, + "grad_norm": 3.958122727061788, + "learning_rate": 6.372700336493546e-07, + "loss": 0.3374, + "step": 11335 + }, + { + "epoch": 3.856086050763148, + "grad_norm": 9.65760106782031, + "learning_rate": 6.354621982588596e-07, + "loss": 0.3428, + "step": 11340 + }, + { + "epoch": 3.8577866587304963, + "grad_norm": 4.3528128756831475, + "learning_rate": 6.336565573591833e-07, + "loss": 0.3384, + "step": 11345 + }, + { + "epoch": 3.8594872666978444, + "grad_norm": 4.133636095887551, + "learning_rate": 6.318531130754949e-07, + "loss": 0.3488, + "step": 11350 + }, + { + "epoch": 3.861187874665193, + "grad_norm": 4.171733396761034, + "learning_rate": 6.300518675303821e-07, + "loss": 0.3519, + "step": 11355 + }, + { + "epoch": 3.862888482632541, + "grad_norm": 14.418268068439486, + "learning_rate": 6.282528228438417e-07, + "loss": 0.3318, + "step": 11360 + }, + { + "epoch": 3.8645890905998894, + "grad_norm": 14.77951062584472, + "learning_rate": 6.264559811332829e-07, + "loss": 0.3408, + "step": 11365 + }, + { + "epoch": 3.866289698567238, + "grad_norm": 3.52859372811698, + "learning_rate": 6.246613445135194e-07, + "loss": 0.3484, + "step": 11370 + }, + { + "epoch": 3.867990306534586, + "grad_norm": 5.050039245168925, + "learning_rate": 6.228689150967718e-07, + "loss": 0.3387, + "step": 11375 + }, + { + "epoch": 3.8696909145019345, + "grad_norm": 3.082733389247098, + "learning_rate": 6.210786949926626e-07, + "loss": 0.3356, + "step": 11380 + }, + { + "epoch": 3.871391522469283, + "grad_norm": 3.621152839290671, + "learning_rate": 6.192906863082121e-07, + "loss": 0.3294, + "step": 11385 + }, + { + "epoch": 3.873092130436631, + "grad_norm": 21.51319810625766, + "learning_rate": 6.175048911478407e-07, + "loss": 0.3412, + "step": 11390 + }, + { + "epoch": 3.8747927384039795, + "grad_norm": 11.121889177588187, + "learning_rate": 6.157213116133604e-07, + "loss": 0.3311, + "step": 11395 + }, + { + "epoch": 3.8764933463713276, + "grad_norm": 23.979128719969413, + "learning_rate": 6.139399498039791e-07, + "loss": 0.3547, + "step": 11400 + }, + { + "epoch": 3.878193954338676, + "grad_norm": 5.665138082890954, + "learning_rate": 6.12160807816291e-07, + "loss": 0.3434, + "step": 11405 + }, + { + "epoch": 3.879894562306024, + "grad_norm": 6.6538289108242745, + "learning_rate": 6.103838877442806e-07, + "loss": 0.3622, + "step": 11410 + }, + { + "epoch": 3.8815951702733726, + "grad_norm": 5.380750179055174, + "learning_rate": 6.086091916793144e-07, + "loss": 0.3529, + "step": 11415 + }, + { + "epoch": 3.883295778240721, + "grad_norm": 3.6063176955128182, + "learning_rate": 6.068367217101446e-07, + "loss": 0.3644, + "step": 11420 + }, + { + "epoch": 3.884996386208069, + "grad_norm": 7.8881610245357345, + "learning_rate": 6.050664799228998e-07, + "loss": 0.3597, + "step": 11425 + }, + { + "epoch": 3.8866969941754177, + "grad_norm": 4.508278572980578, + "learning_rate": 6.03298468401089e-07, + "loss": 0.3466, + "step": 11430 + }, + { + "epoch": 3.888397602142766, + "grad_norm": 3.061000778590297, + "learning_rate": 6.01532689225596e-07, + "loss": 0.3433, + "step": 11435 + }, + { + "epoch": 3.8900982101101143, + "grad_norm": 3.6907552434175925, + "learning_rate": 5.997691444746748e-07, + "loss": 0.3466, + "step": 11440 + }, + { + "epoch": 3.8917988180774628, + "grad_norm": 4.739034167572349, + "learning_rate": 5.980078362239525e-07, + "loss": 0.3391, + "step": 11445 + }, + { + "epoch": 3.8934994260448113, + "grad_norm": 6.560917584236525, + "learning_rate": 5.962487665464217e-07, + "loss": 0.3555, + "step": 11450 + }, + { + "epoch": 3.8952000340121593, + "grad_norm": 4.935951791754003, + "learning_rate": 5.94491937512442e-07, + "loss": 0.3389, + "step": 11455 + }, + { + "epoch": 3.896900641979508, + "grad_norm": 3.786243244585295, + "learning_rate": 5.927373511897341e-07, + "loss": 0.3352, + "step": 11460 + }, + { + "epoch": 3.898601249946856, + "grad_norm": 4.731739492632203, + "learning_rate": 5.909850096433814e-07, + "loss": 0.3542, + "step": 11465 + }, + { + "epoch": 3.9003018579142044, + "grad_norm": 5.166070287462641, + "learning_rate": 5.892349149358223e-07, + "loss": 0.3354, + "step": 11470 + }, + { + "epoch": 3.9020024658815524, + "grad_norm": 5.070315415224376, + "learning_rate": 5.874870691268542e-07, + "loss": 0.3449, + "step": 11475 + }, + { + "epoch": 3.903703073848901, + "grad_norm": 5.32512289281027, + "learning_rate": 5.857414742736239e-07, + "loss": 0.342, + "step": 11480 + }, + { + "epoch": 3.9054036818162494, + "grad_norm": 11.970175542429455, + "learning_rate": 5.839981324306327e-07, + "loss": 0.3565, + "step": 11485 + }, + { + "epoch": 3.9071042897835975, + "grad_norm": 4.697387642701096, + "learning_rate": 5.822570456497267e-07, + "loss": 0.3442, + "step": 11490 + }, + { + "epoch": 3.908804897750946, + "grad_norm": 3.2676261209216415, + "learning_rate": 5.805182159801003e-07, + "loss": 0.3518, + "step": 11495 + }, + { + "epoch": 3.9105055057182945, + "grad_norm": 3.527915684751866, + "learning_rate": 5.787816454682913e-07, + "loss": 0.3454, + "step": 11500 + }, + { + "epoch": 3.9122061136856425, + "grad_norm": 3.9774769863607853, + "learning_rate": 5.770473361581763e-07, + "loss": 0.3573, + "step": 11505 + }, + { + "epoch": 3.913906721652991, + "grad_norm": 4.598293619230731, + "learning_rate": 5.753152900909739e-07, + "loss": 0.3219, + "step": 11510 + }, + { + "epoch": 3.9156073296203395, + "grad_norm": 4.803845055709252, + "learning_rate": 5.73585509305235e-07, + "loss": 0.343, + "step": 11515 + }, + { + "epoch": 3.9173079375876876, + "grad_norm": 5.752345994677968, + "learning_rate": 5.718579958368486e-07, + "loss": 0.3603, + "step": 11520 + }, + { + "epoch": 3.919008545555036, + "grad_norm": 3.582750747285596, + "learning_rate": 5.701327517190311e-07, + "loss": 0.3415, + "step": 11525 + }, + { + "epoch": 3.920709153522384, + "grad_norm": 6.75928754986394, + "learning_rate": 5.684097789823318e-07, + "loss": 0.3307, + "step": 11530 + }, + { + "epoch": 3.9224097614897326, + "grad_norm": 4.849660560457972, + "learning_rate": 5.666890796546228e-07, + "loss": 0.3349, + "step": 11535 + }, + { + "epoch": 3.9241103694570807, + "grad_norm": 4.078913800118893, + "learning_rate": 5.649706557611043e-07, + "loss": 0.3587, + "step": 11540 + }, + { + "epoch": 3.925810977424429, + "grad_norm": 5.998123726294037, + "learning_rate": 5.632545093242949e-07, + "loss": 0.3386, + "step": 11545 + }, + { + "epoch": 3.9275115853917777, + "grad_norm": 12.909793774483513, + "learning_rate": 5.615406423640355e-07, + "loss": 0.3558, + "step": 11550 + }, + { + "epoch": 3.9292121933591257, + "grad_norm": 9.161852962276848, + "learning_rate": 5.598290568974829e-07, + "loss": 0.3484, + "step": 11555 + }, + { + "epoch": 3.9309128013264742, + "grad_norm": 5.681001215541077, + "learning_rate": 5.58119754939108e-07, + "loss": 0.345, + "step": 11560 + }, + { + "epoch": 3.9326134092938227, + "grad_norm": 18.390125379797897, + "learning_rate": 5.564127385006962e-07, + "loss": 0.3573, + "step": 11565 + }, + { + "epoch": 3.934314017261171, + "grad_norm": 4.473431760016146, + "learning_rate": 5.547080095913399e-07, + "loss": 0.3613, + "step": 11570 + }, + { + "epoch": 3.9360146252285193, + "grad_norm": 5.011396356874849, + "learning_rate": 5.530055702174428e-07, + "loss": 0.3577, + "step": 11575 + }, + { + "epoch": 3.9377152331958674, + "grad_norm": 3.4884679262972322, + "learning_rate": 5.513054223827099e-07, + "loss": 0.3486, + "step": 11580 + }, + { + "epoch": 3.939415841163216, + "grad_norm": 7.556067494975034, + "learning_rate": 5.49607568088153e-07, + "loss": 0.34, + "step": 11585 + }, + { + "epoch": 3.941116449130564, + "grad_norm": 3.5129385303314553, + "learning_rate": 5.479120093320814e-07, + "loss": 0.3394, + "step": 11590 + }, + { + "epoch": 3.9428170570979124, + "grad_norm": 5.3164494083512395, + "learning_rate": 5.462187481101053e-07, + "loss": 0.3578, + "step": 11595 + }, + { + "epoch": 3.944517665065261, + "grad_norm": 6.204423170047807, + "learning_rate": 5.445277864151277e-07, + "loss": 0.3382, + "step": 11600 + }, + { + "epoch": 3.946218273032609, + "grad_norm": 4.216013760119267, + "learning_rate": 5.428391262373483e-07, + "loss": 0.3263, + "step": 11605 + }, + { + "epoch": 3.9479188809999575, + "grad_norm": 4.254713132798691, + "learning_rate": 5.411527695642557e-07, + "loss": 0.3322, + "step": 11610 + }, + { + "epoch": 3.949619488967306, + "grad_norm": 7.176228807971487, + "learning_rate": 5.394687183806286e-07, + "loss": 0.3306, + "step": 11615 + }, + { + "epoch": 3.951320096934654, + "grad_norm": 4.554733786870882, + "learning_rate": 5.377869746685326e-07, + "loss": 0.3653, + "step": 11620 + }, + { + "epoch": 3.9530207049020025, + "grad_norm": 5.883493743275225, + "learning_rate": 5.361075404073151e-07, + "loss": 0.3578, + "step": 11625 + }, + { + "epoch": 3.954721312869351, + "grad_norm": 5.552533536404681, + "learning_rate": 5.344304175736089e-07, + "loss": 0.3498, + "step": 11630 + }, + { + "epoch": 3.956421920836699, + "grad_norm": 3.6293689879441855, + "learning_rate": 5.327556081413221e-07, + "loss": 0.3344, + "step": 11635 + }, + { + "epoch": 3.9581225288040476, + "grad_norm": 5.834120539117083, + "learning_rate": 5.31083114081645e-07, + "loss": 0.3537, + "step": 11640 + }, + { + "epoch": 3.9598231367713956, + "grad_norm": 3.559537450025789, + "learning_rate": 5.294129373630383e-07, + "loss": 0.3453, + "step": 11645 + }, + { + "epoch": 3.961523744738744, + "grad_norm": 5.880313505645204, + "learning_rate": 5.277450799512382e-07, + "loss": 0.3399, + "step": 11650 + }, + { + "epoch": 3.963224352706092, + "grad_norm": 30.648536531961824, + "learning_rate": 5.260795438092492e-07, + "loss": 0.359, + "step": 11655 + }, + { + "epoch": 3.9649249606734407, + "grad_norm": 5.685154949790931, + "learning_rate": 5.244163308973457e-07, + "loss": 0.344, + "step": 11660 + }, + { + "epoch": 3.966625568640789, + "grad_norm": 7.036854083405207, + "learning_rate": 5.227554431730655e-07, + "loss": 0.3735, + "step": 11665 + }, + { + "epoch": 3.9683261766081372, + "grad_norm": 4.390123520957557, + "learning_rate": 5.210968825912125e-07, + "loss": 0.35, + "step": 11670 + }, + { + "epoch": 3.9700267845754857, + "grad_norm": 13.586586371890261, + "learning_rate": 5.19440651103848e-07, + "loss": 0.3502, + "step": 11675 + }, + { + "epoch": 3.9717273925428342, + "grad_norm": 8.034737107456621, + "learning_rate": 5.177867506602962e-07, + "loss": 0.3438, + "step": 11680 + }, + { + "epoch": 3.9734280005101823, + "grad_norm": 3.861685766413837, + "learning_rate": 5.16135183207134e-07, + "loss": 0.3406, + "step": 11685 + }, + { + "epoch": 3.975128608477531, + "grad_norm": 5.740961553671016, + "learning_rate": 5.144859506881955e-07, + "loss": 0.3297, + "step": 11690 + }, + { + "epoch": 3.9768292164448793, + "grad_norm": 5.307904514450347, + "learning_rate": 5.128390550445642e-07, + "loss": 0.3348, + "step": 11695 + }, + { + "epoch": 3.9785298244122274, + "grad_norm": 4.416571373165715, + "learning_rate": 5.111944982145744e-07, + "loss": 0.3519, + "step": 11700 + }, + { + "epoch": 3.980230432379576, + "grad_norm": 4.4894924508616265, + "learning_rate": 5.095522821338089e-07, + "loss": 0.3511, + "step": 11705 + }, + { + "epoch": 3.981931040346924, + "grad_norm": 4.571727707355725, + "learning_rate": 5.079124087350925e-07, + "loss": 0.3525, + "step": 11710 + }, + { + "epoch": 3.9836316483142724, + "grad_norm": 4.111811038770526, + "learning_rate": 5.062748799484962e-07, + "loss": 0.3294, + "step": 11715 + }, + { + "epoch": 3.9853322562816205, + "grad_norm": 4.800365658437861, + "learning_rate": 5.046396977013279e-07, + "loss": 0.3443, + "step": 11720 + }, + { + "epoch": 3.987032864248969, + "grad_norm": 10.987577764196464, + "learning_rate": 5.03006863918137e-07, + "loss": 0.3356, + "step": 11725 + }, + { + "epoch": 3.9887334722163175, + "grad_norm": 4.778769125156549, + "learning_rate": 5.013763805207065e-07, + "loss": 0.3397, + "step": 11730 + }, + { + "epoch": 3.9904340801836655, + "grad_norm": 3.846104030028701, + "learning_rate": 4.997482494280545e-07, + "loss": 0.3285, + "step": 11735 + }, + { + "epoch": 3.992134688151014, + "grad_norm": 4.834777993843937, + "learning_rate": 4.981224725564296e-07, + "loss": 0.3601, + "step": 11740 + }, + { + "epoch": 3.9938352961183625, + "grad_norm": 10.691376689487361, + "learning_rate": 4.964990518193108e-07, + "loss": 0.3092, + "step": 11745 + }, + { + "epoch": 3.9955359040857106, + "grad_norm": 5.0267727496093855, + "learning_rate": 4.948779891274022e-07, + "loss": 0.3442, + "step": 11750 + }, + { + "epoch": 3.997236512053059, + "grad_norm": 6.727200424471898, + "learning_rate": 4.93259286388634e-07, + "loss": 0.3259, + "step": 11755 + }, + { + "epoch": 3.998937120020407, + "grad_norm": 3.938920036381914, + "learning_rate": 4.916429455081589e-07, + "loss": 0.362, + "step": 11760 + }, + { + "epoch": 4.00034012159347, + "grad_norm": 5.135731470701529, + "learning_rate": 4.900289683883483e-07, + "loss": 0.2891, + "step": 11765 + }, + { + "epoch": 4.002040729560818, + "grad_norm": 4.496709671994853, + "learning_rate": 4.884173569287937e-07, + "loss": 0.3317, + "step": 11770 + }, + { + "epoch": 4.003741337528166, + "grad_norm": 6.208874128074458, + "learning_rate": 4.868081130263e-07, + "loss": 0.3266, + "step": 11775 + }, + { + "epoch": 4.005441945495515, + "grad_norm": 3.4389004641980785, + "learning_rate": 4.852012385748875e-07, + "loss": 0.3181, + "step": 11780 + }, + { + "epoch": 4.007142553462863, + "grad_norm": 3.8965981469706903, + "learning_rate": 4.835967354657864e-07, + "loss": 0.3317, + "step": 11785 + }, + { + "epoch": 4.008843161430211, + "grad_norm": 4.244085913003326, + "learning_rate": 4.81994605587437e-07, + "loss": 0.3383, + "step": 11790 + }, + { + "epoch": 4.010543769397559, + "grad_norm": 4.734834888059188, + "learning_rate": 4.803948508254852e-07, + "loss": 0.3075, + "step": 11795 + }, + { + "epoch": 4.012244377364908, + "grad_norm": 5.687682401041763, + "learning_rate": 4.787974730627832e-07, + "loss": 0.3404, + "step": 11800 + }, + { + "epoch": 4.013944985332256, + "grad_norm": 4.60895234001638, + "learning_rate": 4.772024741793829e-07, + "loss": 0.3288, + "step": 11805 + }, + { + "epoch": 4.015645593299604, + "grad_norm": 3.3667932580721316, + "learning_rate": 4.756098560525396e-07, + "loss": 0.318, + "step": 11810 + }, + { + "epoch": 4.017346201266953, + "grad_norm": 7.397026909437256, + "learning_rate": 4.7401962055670383e-07, + "loss": 0.3168, + "step": 11815 + }, + { + "epoch": 4.019046809234301, + "grad_norm": 3.8001404263947447, + "learning_rate": 4.7243176956352304e-07, + "loss": 0.3255, + "step": 11820 + }, + { + "epoch": 4.020747417201649, + "grad_norm": 4.107057495039566, + "learning_rate": 4.7084630494183907e-07, + "loss": 0.3262, + "step": 11825 + }, + { + "epoch": 4.022448025168998, + "grad_norm": 4.666556025494843, + "learning_rate": 4.692632285576826e-07, + "loss": 0.3335, + "step": 11830 + }, + { + "epoch": 4.024148633136346, + "grad_norm": 17.10531736370999, + "learning_rate": 4.6768254227427644e-07, + "loss": 0.3335, + "step": 11835 + }, + { + "epoch": 4.025849241103694, + "grad_norm": 3.8543574029556456, + "learning_rate": 4.661042479520275e-07, + "loss": 0.354, + "step": 11840 + }, + { + "epoch": 4.027549849071043, + "grad_norm": 4.412476871447233, + "learning_rate": 4.6452834744853013e-07, + "loss": 0.3471, + "step": 11845 + }, + { + "epoch": 4.029250457038391, + "grad_norm": 6.062941877478426, + "learning_rate": 4.6295484261855845e-07, + "loss": 0.3565, + "step": 11850 + }, + { + "epoch": 4.030951065005739, + "grad_norm": 4.212755558174354, + "learning_rate": 4.613837353140696e-07, + "loss": 0.3366, + "step": 11855 + }, + { + "epoch": 4.0326516729730875, + "grad_norm": 8.39574948864062, + "learning_rate": 4.59815027384197e-07, + "loss": 0.3487, + "step": 11860 + }, + { + "epoch": 4.034352280940436, + "grad_norm": 3.6531708455443277, + "learning_rate": 4.5824872067525173e-07, + "loss": 0.3356, + "step": 11865 + }, + { + "epoch": 4.0360528889077845, + "grad_norm": 10.149279127989958, + "learning_rate": 4.566848170307167e-07, + "loss": 0.3399, + "step": 11870 + }, + { + "epoch": 4.0377534968751325, + "grad_norm": 8.009957540856208, + "learning_rate": 4.551233182912482e-07, + "loss": 0.3355, + "step": 11875 + }, + { + "epoch": 4.0394541048424815, + "grad_norm": 6.730013534884652, + "learning_rate": 4.5356422629467183e-07, + "loss": 0.3265, + "step": 11880 + }, + { + "epoch": 4.0411547128098295, + "grad_norm": 3.4953852628691076, + "learning_rate": 4.5200754287597957e-07, + "loss": 0.3584, + "step": 11885 + }, + { + "epoch": 4.042855320777178, + "grad_norm": 5.58030837836292, + "learning_rate": 4.504532698673297e-07, + "loss": 0.3411, + "step": 11890 + }, + { + "epoch": 4.0445559287445265, + "grad_norm": 5.8306395005855975, + "learning_rate": 4.4890140909804256e-07, + "loss": 0.3174, + "step": 11895 + }, + { + "epoch": 4.046256536711875, + "grad_norm": 5.031289449422286, + "learning_rate": 4.473519623946007e-07, + "loss": 0.3415, + "step": 11900 + }, + { + "epoch": 4.047957144679223, + "grad_norm": 3.398075231787273, + "learning_rate": 4.4580493158064344e-07, + "loss": 0.3361, + "step": 11905 + }, + { + "epoch": 4.049657752646572, + "grad_norm": 5.040352157836528, + "learning_rate": 4.442603184769692e-07, + "loss": 0.3512, + "step": 11910 + }, + { + "epoch": 4.05135836061392, + "grad_norm": 4.506327411419073, + "learning_rate": 4.4271812490152777e-07, + "loss": 0.331, + "step": 11915 + }, + { + "epoch": 4.053058968581268, + "grad_norm": 4.242167387806916, + "learning_rate": 4.411783526694247e-07, + "loss": 0.3537, + "step": 11920 + }, + { + "epoch": 4.054759576548616, + "grad_norm": 4.766638417890972, + "learning_rate": 4.396410035929122e-07, + "loss": 0.3614, + "step": 11925 + }, + { + "epoch": 4.056460184515965, + "grad_norm": 4.9021226309565336, + "learning_rate": 4.3810607948139404e-07, + "loss": 0.3467, + "step": 11930 + }, + { + "epoch": 4.058160792483313, + "grad_norm": 6.173945901366074, + "learning_rate": 4.365735821414163e-07, + "loss": 0.3183, + "step": 11935 + }, + { + "epoch": 4.059861400450661, + "grad_norm": 6.092350405877605, + "learning_rate": 4.35043513376672e-07, + "loss": 0.3365, + "step": 11940 + }, + { + "epoch": 4.06156200841801, + "grad_norm": 9.884534395369656, + "learning_rate": 4.3351587498799474e-07, + "loss": 0.3301, + "step": 11945 + }, + { + "epoch": 4.063262616385358, + "grad_norm": 7.639977419079223, + "learning_rate": 4.3199066877335604e-07, + "loss": 0.3391, + "step": 11950 + }, + { + "epoch": 4.064963224352706, + "grad_norm": 5.863388161011214, + "learning_rate": 4.3046789652786776e-07, + "loss": 0.3283, + "step": 11955 + }, + { + "epoch": 4.066663832320055, + "grad_norm": 9.919872534614637, + "learning_rate": 4.2894756004377395e-07, + "loss": 0.3301, + "step": 11960 + }, + { + "epoch": 4.068364440287403, + "grad_norm": 4.407329918289548, + "learning_rate": 4.2742966111045474e-07, + "loss": 0.3328, + "step": 11965 + }, + { + "epoch": 4.070065048254751, + "grad_norm": 8.995267251325002, + "learning_rate": 4.2591420151441907e-07, + "loss": 0.3232, + "step": 11970 + }, + { + "epoch": 4.0717656562221, + "grad_norm": 4.424394805699658, + "learning_rate": 4.244011830393069e-07, + "loss": 0.3306, + "step": 11975 + }, + { + "epoch": 4.073466264189448, + "grad_norm": 4.906959662453473, + "learning_rate": 4.2289060746588287e-07, + "loss": 0.3681, + "step": 11980 + }, + { + "epoch": 4.075166872156796, + "grad_norm": 5.071382231563371, + "learning_rate": 4.213824765720384e-07, + "loss": 0.3211, + "step": 11985 + }, + { + "epoch": 4.076867480124144, + "grad_norm": 3.9643075804477026, + "learning_rate": 4.198767921327862e-07, + "loss": 0.3458, + "step": 11990 + }, + { + "epoch": 4.078568088091493, + "grad_norm": 10.401584275639529, + "learning_rate": 4.183735559202606e-07, + "loss": 0.3218, + "step": 11995 + }, + { + "epoch": 4.080268696058841, + "grad_norm": 6.065678934112641, + "learning_rate": 4.168727697037142e-07, + "loss": 0.3357, + "step": 12000 + }, + { + "epoch": 4.081969304026189, + "grad_norm": 6.745161884490871, + "learning_rate": 4.1537443524951535e-07, + "loss": 0.3313, + "step": 12005 + }, + { + "epoch": 4.083669911993538, + "grad_norm": 7.948902751138924, + "learning_rate": 4.138785543211482e-07, + "loss": 0.3399, + "step": 12010 + }, + { + "epoch": 4.085370519960886, + "grad_norm": 3.3476008743028696, + "learning_rate": 4.123851286792069e-07, + "loss": 0.352, + "step": 12015 + }, + { + "epoch": 4.087071127928234, + "grad_norm": 8.785461189996955, + "learning_rate": 4.1089416008139896e-07, + "loss": 0.3309, + "step": 12020 + }, + { + "epoch": 4.088771735895583, + "grad_norm": 8.244632978963141, + "learning_rate": 4.0940565028253637e-07, + "loss": 0.3129, + "step": 12025 + }, + { + "epoch": 4.090472343862931, + "grad_norm": 3.7533078229490675, + "learning_rate": 4.07919601034541e-07, + "loss": 0.3202, + "step": 12030 + }, + { + "epoch": 4.092172951830279, + "grad_norm": 4.390947639905816, + "learning_rate": 4.064360140864354e-07, + "loss": 0.3221, + "step": 12035 + }, + { + "epoch": 4.093873559797627, + "grad_norm": 5.598651161063812, + "learning_rate": 4.0495489118434676e-07, + "loss": 0.3559, + "step": 12040 + }, + { + "epoch": 4.095574167764976, + "grad_norm": 4.761020664773868, + "learning_rate": 4.034762340714998e-07, + "loss": 0.3254, + "step": 12045 + }, + { + "epoch": 4.097274775732324, + "grad_norm": 6.763489160147899, + "learning_rate": 4.0200004448821965e-07, + "loss": 0.3429, + "step": 12050 + }, + { + "epoch": 4.098975383699672, + "grad_norm": 5.971382865942539, + "learning_rate": 4.0052632417192456e-07, + "loss": 0.3337, + "step": 12055 + }, + { + "epoch": 4.100675991667021, + "grad_norm": 6.7303913678187826, + "learning_rate": 3.990550748571284e-07, + "loss": 0.3256, + "step": 12060 + }, + { + "epoch": 4.102376599634369, + "grad_norm": 5.083156364017551, + "learning_rate": 3.975862982754369e-07, + "loss": 0.3276, + "step": 12065 + }, + { + "epoch": 4.104077207601717, + "grad_norm": 9.445355887927477, + "learning_rate": 3.961199961555437e-07, + "loss": 0.3381, + "step": 12070 + }, + { + "epoch": 4.105777815569066, + "grad_norm": 5.829236079661058, + "learning_rate": 3.946561702232321e-07, + "loss": 0.3232, + "step": 12075 + }, + { + "epoch": 4.107478423536414, + "grad_norm": 5.016957899041762, + "learning_rate": 3.9319482220136955e-07, + "loss": 0.3531, + "step": 12080 + }, + { + "epoch": 4.109179031503762, + "grad_norm": 5.537718475199659, + "learning_rate": 3.917359538099083e-07, + "loss": 0.3133, + "step": 12085 + }, + { + "epoch": 4.110879639471111, + "grad_norm": 4.802846544132063, + "learning_rate": 3.902795667658804e-07, + "loss": 0.3379, + "step": 12090 + }, + { + "epoch": 4.112580247438459, + "grad_norm": 7.493129998410939, + "learning_rate": 3.8882566278340003e-07, + "loss": 0.3264, + "step": 12095 + }, + { + "epoch": 4.1142808554058075, + "grad_norm": 7.924193722489419, + "learning_rate": 3.8737424357365634e-07, + "loss": 0.313, + "step": 12100 + }, + { + "epoch": 4.1159814633731555, + "grad_norm": 6.096111272079246, + "learning_rate": 3.8592531084491594e-07, + "loss": 0.3567, + "step": 12105 + }, + { + "epoch": 4.1176820713405045, + "grad_norm": 8.966564981592384, + "learning_rate": 3.84478866302517e-07, + "loss": 0.326, + "step": 12110 + }, + { + "epoch": 4.1193826793078525, + "grad_norm": 6.2294686801633015, + "learning_rate": 3.8303491164887185e-07, + "loss": 0.345, + "step": 12115 + }, + { + "epoch": 4.121083287275201, + "grad_norm": 8.22935998004008, + "learning_rate": 3.815934485834591e-07, + "loss": 0.3341, + "step": 12120 + }, + { + "epoch": 4.1227838952425495, + "grad_norm": 5.905648836128258, + "learning_rate": 3.801544788028275e-07, + "loss": 0.3152, + "step": 12125 + }, + { + "epoch": 4.124484503209898, + "grad_norm": 3.4890504674637013, + "learning_rate": 3.787180040005908e-07, + "loss": 0.3425, + "step": 12130 + }, + { + "epoch": 4.126185111177246, + "grad_norm": 3.7863186325993596, + "learning_rate": 3.772840258674243e-07, + "loss": 0.3399, + "step": 12135 + }, + { + "epoch": 4.127885719144595, + "grad_norm": 5.226220359513184, + "learning_rate": 3.758525460910681e-07, + "loss": 0.3464, + "step": 12140 + }, + { + "epoch": 4.129586327111943, + "grad_norm": 9.037509795642451, + "learning_rate": 3.744235663563181e-07, + "loss": 0.3428, + "step": 12145 + }, + { + "epoch": 4.131286935079291, + "grad_norm": 5.30353100529993, + "learning_rate": 3.729970883450315e-07, + "loss": 0.3329, + "step": 12150 + }, + { + "epoch": 4.132987543046639, + "grad_norm": 5.967745002578333, + "learning_rate": 3.715731137361178e-07, + "loss": 0.3414, + "step": 12155 + }, + { + "epoch": 4.134688151013988, + "grad_norm": 3.6828922962503916, + "learning_rate": 3.701516442055425e-07, + "loss": 0.3219, + "step": 12160 + }, + { + "epoch": 4.136388758981336, + "grad_norm": 4.306941870592475, + "learning_rate": 3.687326814263209e-07, + "loss": 0.3327, + "step": 12165 + }, + { + "epoch": 4.138089366948684, + "grad_norm": 20.32543527178242, + "learning_rate": 3.673162270685196e-07, + "loss": 0.3409, + "step": 12170 + }, + { + "epoch": 4.139789974916033, + "grad_norm": 9.974612321929813, + "learning_rate": 3.6590228279925116e-07, + "loss": 0.3322, + "step": 12175 + }, + { + "epoch": 4.141490582883381, + "grad_norm": 5.616076419213856, + "learning_rate": 3.644908502826755e-07, + "loss": 0.332, + "step": 12180 + }, + { + "epoch": 4.143191190850729, + "grad_norm": 6.9246878710743465, + "learning_rate": 3.6308193117999573e-07, + "loss": 0.3365, + "step": 12185 + }, + { + "epoch": 4.144891798818078, + "grad_norm": 5.369312933630458, + "learning_rate": 3.616755271494557e-07, + "loss": 0.3192, + "step": 12190 + }, + { + "epoch": 4.146592406785426, + "grad_norm": 5.059545246742273, + "learning_rate": 3.60271639846341e-07, + "loss": 0.317, + "step": 12195 + }, + { + "epoch": 4.148293014752774, + "grad_norm": 3.4426579367914507, + "learning_rate": 3.5887027092297327e-07, + "loss": 0.3312, + "step": 12200 + }, + { + "epoch": 4.149993622720123, + "grad_norm": 5.554490698341612, + "learning_rate": 3.5747142202871204e-07, + "loss": 0.3408, + "step": 12205 + }, + { + "epoch": 4.151694230687471, + "grad_norm": 19.692853991946798, + "learning_rate": 3.560750948099484e-07, + "loss": 0.3324, + "step": 12210 + }, + { + "epoch": 4.153394838654819, + "grad_norm": 4.387421607416816, + "learning_rate": 3.5468129091010854e-07, + "loss": 0.3025, + "step": 12215 + }, + { + "epoch": 4.155095446622167, + "grad_norm": 3.695903233173566, + "learning_rate": 3.5329001196964557e-07, + "loss": 0.3233, + "step": 12220 + }, + { + "epoch": 4.156796054589516, + "grad_norm": 8.639861256721842, + "learning_rate": 3.5190125962604405e-07, + "loss": 0.3372, + "step": 12225 + }, + { + "epoch": 4.158496662556864, + "grad_norm": 5.297218588291433, + "learning_rate": 3.5051503551381175e-07, + "loss": 0.3599, + "step": 12230 + }, + { + "epoch": 4.160197270524212, + "grad_norm": 4.766813102258966, + "learning_rate": 3.4913134126448365e-07, + "loss": 0.3458, + "step": 12235 + }, + { + "epoch": 4.161897878491561, + "grad_norm": 3.890680257164456, + "learning_rate": 3.4775017850661425e-07, + "loss": 0.3197, + "step": 12240 + }, + { + "epoch": 4.163598486458909, + "grad_norm": 2.740938586587348, + "learning_rate": 3.4637154886578123e-07, + "loss": 0.3442, + "step": 12245 + }, + { + "epoch": 4.165299094426257, + "grad_norm": 4.018664745701717, + "learning_rate": 3.4499545396458e-07, + "loss": 0.3376, + "step": 12250 + }, + { + "epoch": 4.166999702393606, + "grad_norm": 13.659635890780235, + "learning_rate": 3.436218954226214e-07, + "loss": 0.3299, + "step": 12255 + }, + { + "epoch": 4.168700310360954, + "grad_norm": 4.199637246793097, + "learning_rate": 3.422508748565334e-07, + "loss": 0.3346, + "step": 12260 + }, + { + "epoch": 4.170400918328302, + "grad_norm": 4.015286212983886, + "learning_rate": 3.408823938799544e-07, + "loss": 0.3408, + "step": 12265 + }, + { + "epoch": 4.172101526295651, + "grad_norm": 14.278961850137955, + "learning_rate": 3.395164541035359e-07, + "loss": 0.3329, + "step": 12270 + }, + { + "epoch": 4.173802134262999, + "grad_norm": 4.070675448977317, + "learning_rate": 3.38153057134937e-07, + "loss": 0.3293, + "step": 12275 + }, + { + "epoch": 4.175502742230347, + "grad_norm": 7.07040459557904, + "learning_rate": 3.3679220457882525e-07, + "loss": 0.3378, + "step": 12280 + }, + { + "epoch": 4.177203350197695, + "grad_norm": 6.11377470581607, + "learning_rate": 3.3543389803687207e-07, + "loss": 0.3417, + "step": 12285 + }, + { + "epoch": 4.178903958165044, + "grad_norm": 5.743366459885062, + "learning_rate": 3.340781391077541e-07, + "loss": 0.3521, + "step": 12290 + }, + { + "epoch": 4.180604566132392, + "grad_norm": 12.218672829962044, + "learning_rate": 3.3272492938714733e-07, + "loss": 0.3117, + "step": 12295 + }, + { + "epoch": 4.18230517409974, + "grad_norm": 9.896118453917886, + "learning_rate": 3.3137427046772975e-07, + "loss": 0.3359, + "step": 12300 + }, + { + "epoch": 4.184005782067089, + "grad_norm": 5.284094524863797, + "learning_rate": 3.300261639391761e-07, + "loss": 0.3132, + "step": 12305 + }, + { + "epoch": 4.185706390034437, + "grad_norm": 3.7749420884738427, + "learning_rate": 3.28680611388156e-07, + "loss": 0.3173, + "step": 12310 + }, + { + "epoch": 4.187406998001785, + "grad_norm": 20.930912970941833, + "learning_rate": 3.273376143983356e-07, + "loss": 0.3447, + "step": 12315 + }, + { + "epoch": 4.189107605969134, + "grad_norm": 4.613633442599528, + "learning_rate": 3.259971745503704e-07, + "loss": 0.3421, + "step": 12320 + }, + { + "epoch": 4.190808213936482, + "grad_norm": 5.5828361989817115, + "learning_rate": 3.2465929342190867e-07, + "loss": 0.3452, + "step": 12325 + }, + { + "epoch": 4.1925088219038305, + "grad_norm": 4.37957903611316, + "learning_rate": 3.233239725875853e-07, + "loss": 0.3574, + "step": 12330 + }, + { + "epoch": 4.194209429871179, + "grad_norm": 4.51865029906568, + "learning_rate": 3.219912136190237e-07, + "loss": 0.3309, + "step": 12335 + }, + { + "epoch": 4.1959100378385275, + "grad_norm": 6.1822263663390755, + "learning_rate": 3.206610180848296e-07, + "loss": 0.3358, + "step": 12340 + }, + { + "epoch": 4.1976106458058755, + "grad_norm": 3.701188018585034, + "learning_rate": 3.1933338755059497e-07, + "loss": 0.3205, + "step": 12345 + }, + { + "epoch": 4.199311253773224, + "grad_norm": 7.457425086689321, + "learning_rate": 3.1800832357888887e-07, + "loss": 0.3302, + "step": 12350 + }, + { + "epoch": 4.2010118617405725, + "grad_norm": 5.98366094900533, + "learning_rate": 3.1668582772926367e-07, + "loss": 0.3231, + "step": 12355 + }, + { + "epoch": 4.202712469707921, + "grad_norm": 6.143107452522967, + "learning_rate": 3.1536590155824554e-07, + "loss": 0.3348, + "step": 12360 + }, + { + "epoch": 4.204413077675269, + "grad_norm": 4.104381555667731, + "learning_rate": 3.1404854661933853e-07, + "loss": 0.3218, + "step": 12365 + }, + { + "epoch": 4.206113685642618, + "grad_norm": 23.656709595759143, + "learning_rate": 3.1273376446302073e-07, + "loss": 0.3162, + "step": 12370 + }, + { + "epoch": 4.207814293609966, + "grad_norm": 13.627337480427906, + "learning_rate": 3.1142155663674013e-07, + "loss": 0.3279, + "step": 12375 + }, + { + "epoch": 4.209514901577314, + "grad_norm": 3.976971210462784, + "learning_rate": 3.1011192468491657e-07, + "loss": 0.3242, + "step": 12380 + }, + { + "epoch": 4.211215509544663, + "grad_norm": 9.02059291841068, + "learning_rate": 3.088048701489368e-07, + "loss": 0.3399, + "step": 12385 + }, + { + "epoch": 4.212916117512011, + "grad_norm": 7.79347074633455, + "learning_rate": 3.075003945671559e-07, + "loss": 0.3187, + "step": 12390 + }, + { + "epoch": 4.214616725479359, + "grad_norm": 3.501972624953021, + "learning_rate": 3.0619849947489123e-07, + "loss": 0.3289, + "step": 12395 + }, + { + "epoch": 4.216317333446707, + "grad_norm": 4.558927656676324, + "learning_rate": 3.0489918640442593e-07, + "loss": 0.3453, + "step": 12400 + }, + { + "epoch": 4.218017941414056, + "grad_norm": 11.039193918155615, + "learning_rate": 3.03602456885001e-07, + "loss": 0.3145, + "step": 12405 + }, + { + "epoch": 4.219718549381404, + "grad_norm": 4.691774804474439, + "learning_rate": 3.0230831244281943e-07, + "loss": 0.3244, + "step": 12410 + }, + { + "epoch": 4.221419157348752, + "grad_norm": 37.11292414140762, + "learning_rate": 3.010167546010395e-07, + "loss": 0.316, + "step": 12415 + }, + { + "epoch": 4.223119765316101, + "grad_norm": 4.6314090129134735, + "learning_rate": 2.997277848797769e-07, + "loss": 0.3254, + "step": 12420 + }, + { + "epoch": 4.224820373283449, + "grad_norm": 4.314612424750355, + "learning_rate": 2.9844140479610067e-07, + "loss": 0.3282, + "step": 12425 + }, + { + "epoch": 4.226520981250797, + "grad_norm": 9.376175709616241, + "learning_rate": 2.97157615864031e-07, + "loss": 0.3271, + "step": 12430 + }, + { + "epoch": 4.228221589218146, + "grad_norm": 7.5985891288198175, + "learning_rate": 2.9587641959454016e-07, + "loss": 0.3177, + "step": 12435 + }, + { + "epoch": 4.229922197185494, + "grad_norm": 3.748623010254426, + "learning_rate": 2.945978174955466e-07, + "loss": 0.3318, + "step": 12440 + }, + { + "epoch": 4.231622805152842, + "grad_norm": 3.5006880481945637, + "learning_rate": 2.9332181107191827e-07, + "loss": 0.3305, + "step": 12445 + }, + { + "epoch": 4.233323413120191, + "grad_norm": 5.626310084463864, + "learning_rate": 2.9204840182546547e-07, + "loss": 0.3236, + "step": 12450 + }, + { + "epoch": 4.235024021087539, + "grad_norm": 4.888099802880378, + "learning_rate": 2.90777591254944e-07, + "loss": 0.3361, + "step": 12455 + }, + { + "epoch": 4.236724629054887, + "grad_norm": 5.996948734691151, + "learning_rate": 2.895093808560492e-07, + "loss": 0.3179, + "step": 12460 + }, + { + "epoch": 4.238425237022235, + "grad_norm": 9.206406290347116, + "learning_rate": 2.882437721214179e-07, + "loss": 0.3351, + "step": 12465 + }, + { + "epoch": 4.240125844989584, + "grad_norm": 3.434852619964506, + "learning_rate": 2.8698076654062266e-07, + "loss": 0.3554, + "step": 12470 + }, + { + "epoch": 4.241826452956932, + "grad_norm": 21.163064351724614, + "learning_rate": 2.8572036560017504e-07, + "loss": 0.3268, + "step": 12475 + }, + { + "epoch": 4.24352706092428, + "grad_norm": 4.7982412121970635, + "learning_rate": 2.84462570783518e-07, + "loss": 0.3428, + "step": 12480 + }, + { + "epoch": 4.245227668891629, + "grad_norm": 6.091033321935728, + "learning_rate": 2.832073835710295e-07, + "loss": 0.3298, + "step": 12485 + }, + { + "epoch": 4.246928276858977, + "grad_norm": 3.8101277792160797, + "learning_rate": 2.819548054400181e-07, + "loss": 0.3448, + "step": 12490 + }, + { + "epoch": 4.248628884826325, + "grad_norm": 4.627526846992593, + "learning_rate": 2.8070483786472036e-07, + "loss": 0.3364, + "step": 12495 + }, + { + "epoch": 4.250329492793674, + "grad_norm": 11.60964562591399, + "learning_rate": 2.7945748231630154e-07, + "loss": 0.3361, + "step": 12500 + }, + { + "epoch": 4.252030100761022, + "grad_norm": 3.4926976379349997, + "learning_rate": 2.782127402628515e-07, + "loss": 0.3433, + "step": 12505 + }, + { + "epoch": 4.25373070872837, + "grad_norm": 4.21342937678202, + "learning_rate": 2.7697061316938586e-07, + "loss": 0.3018, + "step": 12510 + }, + { + "epoch": 4.255431316695718, + "grad_norm": 7.127233077067102, + "learning_rate": 2.7573110249784014e-07, + "loss": 0.3407, + "step": 12515 + }, + { + "epoch": 4.257131924663067, + "grad_norm": 8.685577768966226, + "learning_rate": 2.7449420970707297e-07, + "loss": 0.3041, + "step": 12520 + }, + { + "epoch": 4.258832532630415, + "grad_norm": 4.294271976213762, + "learning_rate": 2.732599362528596e-07, + "loss": 0.332, + "step": 12525 + }, + { + "epoch": 4.260533140597763, + "grad_norm": 4.503960368111351, + "learning_rate": 2.7202828358789455e-07, + "loss": 0.3232, + "step": 12530 + }, + { + "epoch": 4.262233748565112, + "grad_norm": 4.529662416358458, + "learning_rate": 2.7079925316178536e-07, + "loss": 0.3121, + "step": 12535 + }, + { + "epoch": 4.26393435653246, + "grad_norm": 4.065393870043167, + "learning_rate": 2.6957284642105536e-07, + "loss": 0.3622, + "step": 12540 + }, + { + "epoch": 4.265634964499808, + "grad_norm": 8.070950137139976, + "learning_rate": 2.6834906480913943e-07, + "loss": 0.3289, + "step": 12545 + }, + { + "epoch": 4.267335572467157, + "grad_norm": 4.742477935472126, + "learning_rate": 2.671279097663818e-07, + "loss": 0.3404, + "step": 12550 + }, + { + "epoch": 4.269036180434505, + "grad_norm": 10.85672255056592, + "learning_rate": 2.659093827300366e-07, + "loss": 0.3502, + "step": 12555 + }, + { + "epoch": 4.2707367884018534, + "grad_norm": 7.5889566127981105, + "learning_rate": 2.6469348513426336e-07, + "loss": 0.333, + "step": 12560 + }, + { + "epoch": 4.272437396369202, + "grad_norm": 4.797312226441103, + "learning_rate": 2.634802184101287e-07, + "loss": 0.3096, + "step": 12565 + }, + { + "epoch": 4.2741380043365504, + "grad_norm": 3.940287114364061, + "learning_rate": 2.6226958398560124e-07, + "loss": 0.3189, + "step": 12570 + }, + { + "epoch": 4.2758386123038985, + "grad_norm": 5.914220125672408, + "learning_rate": 2.6106158328555313e-07, + "loss": 0.3348, + "step": 12575 + }, + { + "epoch": 4.277539220271247, + "grad_norm": 13.579740966019758, + "learning_rate": 2.598562177317543e-07, + "loss": 0.3529, + "step": 12580 + }, + { + "epoch": 4.2792398282385955, + "grad_norm": 15.490252641479021, + "learning_rate": 2.58653488742876e-07, + "loss": 0.3415, + "step": 12585 + }, + { + "epoch": 4.2809404362059436, + "grad_norm": 6.938888473730673, + "learning_rate": 2.574533977344837e-07, + "loss": 0.3337, + "step": 12590 + }, + { + "epoch": 4.282641044173292, + "grad_norm": 3.7425468287506827, + "learning_rate": 2.562559461190406e-07, + "loss": 0.3344, + "step": 12595 + }, + { + "epoch": 4.2843416521406406, + "grad_norm": 7.1439052669196865, + "learning_rate": 2.550611353059013e-07, + "loss": 0.3367, + "step": 12600 + }, + { + "epoch": 4.286042260107989, + "grad_norm": 14.236621388928102, + "learning_rate": 2.5386896670131336e-07, + "loss": 0.3216, + "step": 12605 + }, + { + "epoch": 4.287742868075337, + "grad_norm": 3.9103518012490803, + "learning_rate": 2.5267944170841494e-07, + "loss": 0.3453, + "step": 12610 + }, + { + "epoch": 4.289443476042686, + "grad_norm": 15.03324667573704, + "learning_rate": 2.5149256172723095e-07, + "loss": 0.336, + "step": 12615 + }, + { + "epoch": 4.291144084010034, + "grad_norm": 4.9505553564093825, + "learning_rate": 2.5030832815467615e-07, + "loss": 0.3391, + "step": 12620 + }, + { + "epoch": 4.292844691977382, + "grad_norm": 12.07133173210356, + "learning_rate": 2.4912674238454724e-07, + "loss": 0.3191, + "step": 12625 + }, + { + "epoch": 4.29454529994473, + "grad_norm": 4.240895964099319, + "learning_rate": 2.479478058075274e-07, + "loss": 0.3407, + "step": 12630 + }, + { + "epoch": 4.296245907912079, + "grad_norm": 5.293194735108943, + "learning_rate": 2.4677151981117946e-07, + "loss": 0.3018, + "step": 12635 + }, + { + "epoch": 4.297946515879427, + "grad_norm": 6.1948378074512025, + "learning_rate": 2.4559788577994903e-07, + "loss": 0.3442, + "step": 12640 + }, + { + "epoch": 4.299647123846775, + "grad_norm": 4.208129472196771, + "learning_rate": 2.4442690509515835e-07, + "loss": 0.342, + "step": 12645 + }, + { + "epoch": 4.301347731814124, + "grad_norm": 5.572045612595394, + "learning_rate": 2.432585791350081e-07, + "loss": 0.3345, + "step": 12650 + }, + { + "epoch": 4.303048339781472, + "grad_norm": 25.10758918360812, + "learning_rate": 2.420929092745733e-07, + "loss": 0.3606, + "step": 12655 + }, + { + "epoch": 4.30474894774882, + "grad_norm": 3.9346043818193657, + "learning_rate": 2.4092989688580373e-07, + "loss": 0.3102, + "step": 12660 + }, + { + "epoch": 4.306449555716169, + "grad_norm": 15.556170787230272, + "learning_rate": 2.3976954333752216e-07, + "loss": 0.3162, + "step": 12665 + }, + { + "epoch": 4.308150163683517, + "grad_norm": 13.705266638952072, + "learning_rate": 2.386118499954196e-07, + "loss": 0.3277, + "step": 12670 + }, + { + "epoch": 4.309850771650865, + "grad_norm": 3.5485079389858263, + "learning_rate": 2.374568182220588e-07, + "loss": 0.3131, + "step": 12675 + }, + { + "epoch": 4.311551379618214, + "grad_norm": 4.700940412896038, + "learning_rate": 2.3630444937686763e-07, + "loss": 0.3549, + "step": 12680 + }, + { + "epoch": 4.313251987585562, + "grad_norm": 6.919243111865925, + "learning_rate": 2.3515474481614175e-07, + "loss": 0.3373, + "step": 12685 + }, + { + "epoch": 4.31495259555291, + "grad_norm": 26.410264409630383, + "learning_rate": 2.3400770589303901e-07, + "loss": 0.3449, + "step": 12690 + }, + { + "epoch": 4.316653203520259, + "grad_norm": 4.110181771728492, + "learning_rate": 2.3286333395758253e-07, + "loss": 0.3302, + "step": 12695 + }, + { + "epoch": 4.318353811487607, + "grad_norm": 5.228698212074843, + "learning_rate": 2.3172163035665386e-07, + "loss": 0.35, + "step": 12700 + }, + { + "epoch": 4.320054419454955, + "grad_norm": 4.7766472552551535, + "learning_rate": 2.3058259643399584e-07, + "loss": 0.3256, + "step": 12705 + }, + { + "epoch": 4.321755027422303, + "grad_norm": 4.290504770111564, + "learning_rate": 2.294462335302078e-07, + "loss": 0.3249, + "step": 12710 + }, + { + "epoch": 4.323455635389652, + "grad_norm": 3.541077946999206, + "learning_rate": 2.283125429827468e-07, + "loss": 0.335, + "step": 12715 + }, + { + "epoch": 4.325156243357, + "grad_norm": 3.962210605540325, + "learning_rate": 2.271815261259236e-07, + "loss": 0.3184, + "step": 12720 + }, + { + "epoch": 4.326856851324348, + "grad_norm": 5.562231012105735, + "learning_rate": 2.2605318429090224e-07, + "loss": 0.3531, + "step": 12725 + }, + { + "epoch": 4.328557459291697, + "grad_norm": 4.038234094819427, + "learning_rate": 2.2492751880569958e-07, + "loss": 0.3241, + "step": 12730 + }, + { + "epoch": 4.330258067259045, + "grad_norm": 4.27507229275323, + "learning_rate": 2.2380453099518057e-07, + "loss": 0.3374, + "step": 12735 + }, + { + "epoch": 4.331958675226393, + "grad_norm": 7.264680467702832, + "learning_rate": 2.2268422218106017e-07, + "loss": 0.3411, + "step": 12740 + }, + { + "epoch": 4.333659283193742, + "grad_norm": 3.1918500942610173, + "learning_rate": 2.2156659368189892e-07, + "loss": 0.3428, + "step": 12745 + }, + { + "epoch": 4.33535989116109, + "grad_norm": 9.260426826471768, + "learning_rate": 2.2045164681310434e-07, + "loss": 0.3258, + "step": 12750 + }, + { + "epoch": 4.337060499128438, + "grad_norm": 5.721978841971327, + "learning_rate": 2.193393828869264e-07, + "loss": 0.3509, + "step": 12755 + }, + { + "epoch": 4.338761107095786, + "grad_norm": 5.891909315177032, + "learning_rate": 2.182298032124583e-07, + "loss": 0.3381, + "step": 12760 + }, + { + "epoch": 4.340461715063135, + "grad_norm": 7.437928772365634, + "learning_rate": 2.171229090956331e-07, + "loss": 0.3438, + "step": 12765 + }, + { + "epoch": 4.342162323030483, + "grad_norm": 7.680142942754952, + "learning_rate": 2.1601870183922402e-07, + "loss": 0.3327, + "step": 12770 + }, + { + "epoch": 4.343862930997831, + "grad_norm": 3.5973104304876835, + "learning_rate": 2.1491718274284063e-07, + "loss": 0.3397, + "step": 12775 + }, + { + "epoch": 4.34556353896518, + "grad_norm": 4.860272530473138, + "learning_rate": 2.1381835310293004e-07, + "loss": 0.3532, + "step": 12780 + }, + { + "epoch": 4.347264146932528, + "grad_norm": 3.7557972617596223, + "learning_rate": 2.1272221421277383e-07, + "loss": 0.3424, + "step": 12785 + }, + { + "epoch": 4.348964754899876, + "grad_norm": 7.552587829453479, + "learning_rate": 2.1162876736248534e-07, + "loss": 0.3552, + "step": 12790 + }, + { + "epoch": 4.350665362867225, + "grad_norm": 5.377756282356434, + "learning_rate": 2.1053801383901117e-07, + "loss": 0.3191, + "step": 12795 + }, + { + "epoch": 4.352365970834573, + "grad_norm": 8.332497388343738, + "learning_rate": 2.0944995492612614e-07, + "loss": 0.3337, + "step": 12800 + }, + { + "epoch": 4.3540665788019215, + "grad_norm": 8.811937511223524, + "learning_rate": 2.0836459190443552e-07, + "loss": 0.3191, + "step": 12805 + }, + { + "epoch": 4.35576718676927, + "grad_norm": 8.97010518298963, + "learning_rate": 2.072819260513703e-07, + "loss": 0.3228, + "step": 12810 + }, + { + "epoch": 4.3574677947366185, + "grad_norm": 53.90870953043256, + "learning_rate": 2.0620195864118786e-07, + "loss": 0.3305, + "step": 12815 + }, + { + "epoch": 4.3591684027039665, + "grad_norm": 4.202582388810684, + "learning_rate": 2.051246909449686e-07, + "loss": 0.3373, + "step": 12820 + }, + { + "epoch": 4.360869010671315, + "grad_norm": 11.864689055727705, + "learning_rate": 2.0405012423061671e-07, + "loss": 0.334, + "step": 12825 + }, + { + "epoch": 4.3625696186386635, + "grad_norm": 4.62385496393284, + "learning_rate": 2.0297825976285602e-07, + "loss": 0.3125, + "step": 12830 + }, + { + "epoch": 4.364270226606012, + "grad_norm": 8.993048469078259, + "learning_rate": 2.0190909880323157e-07, + "loss": 0.3504, + "step": 12835 + }, + { + "epoch": 4.36597083457336, + "grad_norm": 6.876532401332347, + "learning_rate": 2.0084264261010427e-07, + "loss": 0.3355, + "step": 12840 + }, + { + "epoch": 4.367671442540709, + "grad_norm": 5.528186576955831, + "learning_rate": 1.9977889243865429e-07, + "loss": 0.3237, + "step": 12845 + }, + { + "epoch": 4.369372050508057, + "grad_norm": 7.219172837643221, + "learning_rate": 1.9871784954087509e-07, + "loss": 0.3386, + "step": 12850 + }, + { + "epoch": 4.371072658475405, + "grad_norm": 4.891547091814564, + "learning_rate": 1.9765951516557358e-07, + "loss": 0.3086, + "step": 12855 + }, + { + "epoch": 4.372773266442754, + "grad_norm": 6.735058635539884, + "learning_rate": 1.9660389055837032e-07, + "loss": 0.3194, + "step": 12860 + }, + { + "epoch": 4.374473874410102, + "grad_norm": 6.233073816777856, + "learning_rate": 1.9555097696169505e-07, + "loss": 0.332, + "step": 12865 + }, + { + "epoch": 4.37617448237745, + "grad_norm": 7.408104700323201, + "learning_rate": 1.9450077561478875e-07, + "loss": 0.3456, + "step": 12870 + }, + { + "epoch": 4.377875090344798, + "grad_norm": 10.966215800829005, + "learning_rate": 1.934532877536971e-07, + "loss": 0.3387, + "step": 12875 + }, + { + "epoch": 4.379575698312147, + "grad_norm": 7.624994132110487, + "learning_rate": 1.9240851461127556e-07, + "loss": 0.3444, + "step": 12880 + }, + { + "epoch": 4.381276306279495, + "grad_norm": 4.453753920721134, + "learning_rate": 1.913664574171814e-07, + "loss": 0.3312, + "step": 12885 + }, + { + "epoch": 4.382976914246843, + "grad_norm": 3.3374771090875863, + "learning_rate": 1.9032711739787767e-07, + "loss": 0.307, + "step": 12890 + }, + { + "epoch": 4.384677522214192, + "grad_norm": 5.736321962818629, + "learning_rate": 1.8929049577662783e-07, + "loss": 0.3551, + "step": 12895 + }, + { + "epoch": 4.38637813018154, + "grad_norm": 6.918296159726646, + "learning_rate": 1.8825659377349704e-07, + "loss": 0.3317, + "step": 12900 + }, + { + "epoch": 4.388078738148888, + "grad_norm": 5.970245167001063, + "learning_rate": 1.8722541260534856e-07, + "loss": 0.3164, + "step": 12905 + }, + { + "epoch": 4.389779346116237, + "grad_norm": 8.067603469833086, + "learning_rate": 1.8619695348584433e-07, + "loss": 0.353, + "step": 12910 + }, + { + "epoch": 4.391479954083585, + "grad_norm": 4.4113207435854696, + "learning_rate": 1.8517121762544138e-07, + "loss": 0.3536, + "step": 12915 + }, + { + "epoch": 4.393180562050933, + "grad_norm": 3.9116648499255584, + "learning_rate": 1.841482062313929e-07, + "loss": 0.3313, + "step": 12920 + }, + { + "epoch": 4.394881170018282, + "grad_norm": 5.199219652097869, + "learning_rate": 1.8312792050774408e-07, + "loss": 0.3239, + "step": 12925 + }, + { + "epoch": 4.39658177798563, + "grad_norm": 6.230748647200844, + "learning_rate": 1.8211036165533324e-07, + "loss": 0.3279, + "step": 12930 + }, + { + "epoch": 4.398282385952978, + "grad_norm": 4.369606011124405, + "learning_rate": 1.8109553087178906e-07, + "loss": 0.3315, + "step": 12935 + }, + { + "epoch": 4.399982993920326, + "grad_norm": 3.671190072982252, + "learning_rate": 1.8008342935152855e-07, + "loss": 0.3244, + "step": 12940 + }, + { + "epoch": 4.401683601887675, + "grad_norm": 15.715314195589261, + "learning_rate": 1.7907405828575808e-07, + "loss": 0.3397, + "step": 12945 + }, + { + "epoch": 4.403384209855023, + "grad_norm": 4.114398666967137, + "learning_rate": 1.780674188624684e-07, + "loss": 0.3246, + "step": 12950 + }, + { + "epoch": 4.405084817822371, + "grad_norm": 4.237456116028077, + "learning_rate": 1.7706351226643687e-07, + "loss": 0.3046, + "step": 12955 + }, + { + "epoch": 4.40678542578972, + "grad_norm": 5.355889648244975, + "learning_rate": 1.7606233967922327e-07, + "loss": 0.3192, + "step": 12960 + }, + { + "epoch": 4.408486033757068, + "grad_norm": 4.169810263927873, + "learning_rate": 1.7506390227917086e-07, + "loss": 0.3324, + "step": 12965 + }, + { + "epoch": 4.410186641724416, + "grad_norm": 3.2655003902600384, + "learning_rate": 1.7406820124140162e-07, + "loss": 0.3339, + "step": 12970 + }, + { + "epoch": 4.411887249691765, + "grad_norm": 8.793739027776986, + "learning_rate": 1.7307523773781948e-07, + "loss": 0.326, + "step": 12975 + }, + { + "epoch": 4.413587857659113, + "grad_norm": 4.087601343185249, + "learning_rate": 1.7208501293710433e-07, + "loss": 0.3265, + "step": 12980 + }, + { + "epoch": 4.415288465626461, + "grad_norm": 7.843239771835991, + "learning_rate": 1.7109752800471392e-07, + "loss": 0.3446, + "step": 12985 + }, + { + "epoch": 4.416989073593809, + "grad_norm": 4.0065398536899695, + "learning_rate": 1.701127841028802e-07, + "loss": 0.33, + "step": 12990 + }, + { + "epoch": 4.418689681561158, + "grad_norm": 7.207831924182851, + "learning_rate": 1.691307823906102e-07, + "loss": 0.3259, + "step": 12995 + }, + { + "epoch": 4.420390289528506, + "grad_norm": 6.902820446735924, + "learning_rate": 1.6815152402368334e-07, + "loss": 0.3333, + "step": 13000 + }, + { + "epoch": 4.422090897495854, + "grad_norm": 4.112236792551807, + "learning_rate": 1.6717501015464905e-07, + "loss": 0.3222, + "step": 13005 + }, + { + "epoch": 4.423791505463203, + "grad_norm": 4.152300939447504, + "learning_rate": 1.6620124193282793e-07, + "loss": 0.316, + "step": 13010 + }, + { + "epoch": 4.425492113430551, + "grad_norm": 4.39405264061652, + "learning_rate": 1.6523022050430826e-07, + "loss": 0.3165, + "step": 13015 + }, + { + "epoch": 4.427192721397899, + "grad_norm": 4.366272517121309, + "learning_rate": 1.642619470119461e-07, + "loss": 0.3398, + "step": 13020 + }, + { + "epoch": 4.428893329365248, + "grad_norm": 3.589598372160312, + "learning_rate": 1.6329642259536234e-07, + "loss": 0.3312, + "step": 13025 + }, + { + "epoch": 4.430593937332596, + "grad_norm": 3.4086614751423157, + "learning_rate": 1.6233364839094324e-07, + "loss": 0.3381, + "step": 13030 + }, + { + "epoch": 4.4322945452999445, + "grad_norm": 8.301177756196116, + "learning_rate": 1.6137362553183766e-07, + "loss": 0.3644, + "step": 13035 + }, + { + "epoch": 4.433995153267293, + "grad_norm": 4.825972903763466, + "learning_rate": 1.604163551479568e-07, + "loss": 0.3157, + "step": 13040 + }, + { + "epoch": 4.4356957612346415, + "grad_norm": 3.3252927866456305, + "learning_rate": 1.59461838365971e-07, + "loss": 0.3311, + "step": 13045 + }, + { + "epoch": 4.4373963692019895, + "grad_norm": 15.790663252217936, + "learning_rate": 1.5851007630931115e-07, + "loss": 0.3477, + "step": 13050 + }, + { + "epoch": 4.4390969771693385, + "grad_norm": 5.767935348002293, + "learning_rate": 1.5756107009816586e-07, + "loss": 0.3274, + "step": 13055 + }, + { + "epoch": 4.4407975851366865, + "grad_norm": 4.79697016294387, + "learning_rate": 1.566148208494786e-07, + "loss": 0.3316, + "step": 13060 + }, + { + "epoch": 4.442498193104035, + "grad_norm": 4.731467520877539, + "learning_rate": 1.556713296769502e-07, + "loss": 0.3309, + "step": 13065 + }, + { + "epoch": 4.444198801071383, + "grad_norm": 5.88336906810782, + "learning_rate": 1.547305976910335e-07, + "loss": 0.3464, + "step": 13070 + }, + { + "epoch": 4.445899409038732, + "grad_norm": 5.955078035740091, + "learning_rate": 1.5379262599893501e-07, + "loss": 0.3263, + "step": 13075 + }, + { + "epoch": 4.44760001700608, + "grad_norm": 5.12040950557112, + "learning_rate": 1.5285741570461198e-07, + "loss": 0.3085, + "step": 13080 + }, + { + "epoch": 4.449300624973428, + "grad_norm": 10.023332032947414, + "learning_rate": 1.5192496790877198e-07, + "loss": 0.3169, + "step": 13085 + }, + { + "epoch": 4.451001232940777, + "grad_norm": 4.201150526747543, + "learning_rate": 1.5099528370887018e-07, + "loss": 0.3304, + "step": 13090 + }, + { + "epoch": 4.452701840908125, + "grad_norm": 3.888545762383645, + "learning_rate": 1.5006836419911102e-07, + "loss": 0.3412, + "step": 13095 + }, + { + "epoch": 4.454402448875473, + "grad_norm": 3.65215765164716, + "learning_rate": 1.4914421047044297e-07, + "loss": 0.3324, + "step": 13100 + }, + { + "epoch": 4.456103056842822, + "grad_norm": 4.101559637581969, + "learning_rate": 1.4822282361056095e-07, + "loss": 0.3137, + "step": 13105 + }, + { + "epoch": 4.45780366481017, + "grad_norm": 3.6110004134728793, + "learning_rate": 1.4730420470390193e-07, + "loss": 0.3293, + "step": 13110 + }, + { + "epoch": 4.459504272777518, + "grad_norm": 5.576116168819829, + "learning_rate": 1.4638835483164581e-07, + "loss": 0.3155, + "step": 13115 + }, + { + "epoch": 4.461204880744866, + "grad_norm": 11.093818283062696, + "learning_rate": 1.4547527507171422e-07, + "loss": 0.3331, + "step": 13120 + }, + { + "epoch": 4.462905488712215, + "grad_norm": 9.074945847283377, + "learning_rate": 1.4456496649876668e-07, + "loss": 0.3107, + "step": 13125 + }, + { + "epoch": 4.464606096679563, + "grad_norm": 6.604581904508539, + "learning_rate": 1.436574301842028e-07, + "loss": 0.2965, + "step": 13130 + }, + { + "epoch": 4.466306704646911, + "grad_norm": 4.158699458490385, + "learning_rate": 1.427526671961582e-07, + "loss": 0.3348, + "step": 13135 + }, + { + "epoch": 4.46800731261426, + "grad_norm": 5.955671410779653, + "learning_rate": 1.4185067859950553e-07, + "loss": 0.3288, + "step": 13140 + }, + { + "epoch": 4.469707920581608, + "grad_norm": 7.5923118833897165, + "learning_rate": 1.4095146545585052e-07, + "loss": 0.32, + "step": 13145 + }, + { + "epoch": 4.471408528548956, + "grad_norm": 5.261442511978854, + "learning_rate": 1.4005502882353418e-07, + "loss": 0.3353, + "step": 13150 + }, + { + "epoch": 4.473109136516305, + "grad_norm": 5.426856594721494, + "learning_rate": 1.3916136975762772e-07, + "loss": 0.305, + "step": 13155 + }, + { + "epoch": 4.474809744483653, + "grad_norm": 4.841462577673882, + "learning_rate": 1.3827048930993487e-07, + "loss": 0.3428, + "step": 13160 + }, + { + "epoch": 4.476510352451001, + "grad_norm": 3.6496581728439055, + "learning_rate": 1.3738238852898794e-07, + "loss": 0.3318, + "step": 13165 + }, + { + "epoch": 4.47821096041835, + "grad_norm": 5.300579498588721, + "learning_rate": 1.3649706846004862e-07, + "loss": 0.3286, + "step": 13170 + }, + { + "epoch": 4.479911568385698, + "grad_norm": 7.826093550107106, + "learning_rate": 1.3561453014510506e-07, + "loss": 0.3335, + "step": 13175 + }, + { + "epoch": 4.481612176353046, + "grad_norm": 5.432994510418763, + "learning_rate": 1.3473477462287166e-07, + "loss": 0.3135, + "step": 13180 + }, + { + "epoch": 4.483312784320394, + "grad_norm": 7.125487104046547, + "learning_rate": 1.3385780292878764e-07, + "loss": 0.3433, + "step": 13185 + }, + { + "epoch": 4.485013392287743, + "grad_norm": 8.747829967326172, + "learning_rate": 1.3298361609501513e-07, + "loss": 0.3417, + "step": 13190 + }, + { + "epoch": 4.486714000255091, + "grad_norm": 3.2934469329019627, + "learning_rate": 1.321122151504403e-07, + "loss": 0.3395, + "step": 13195 + }, + { + "epoch": 4.488414608222439, + "grad_norm": 4.913953515495719, + "learning_rate": 1.3124360112066775e-07, + "loss": 0.3437, + "step": 13200 + }, + { + "epoch": 4.490115216189788, + "grad_norm": 4.147259235923274, + "learning_rate": 1.3037777502802497e-07, + "loss": 0.3222, + "step": 13205 + }, + { + "epoch": 4.491815824157136, + "grad_norm": 5.4785928158207335, + "learning_rate": 1.2951473789155568e-07, + "loss": 0.3309, + "step": 13210 + }, + { + "epoch": 4.493516432124484, + "grad_norm": 4.861894897417913, + "learning_rate": 1.2865449072702263e-07, + "loss": 0.3389, + "step": 13215 + }, + { + "epoch": 4.495217040091833, + "grad_norm": 4.048798735180959, + "learning_rate": 1.27797034546904e-07, + "loss": 0.3193, + "step": 13220 + }, + { + "epoch": 4.496917648059181, + "grad_norm": 5.437187565523013, + "learning_rate": 1.2694237036039393e-07, + "loss": 0.3283, + "step": 13225 + }, + { + "epoch": 4.498618256026529, + "grad_norm": 8.55892872499695, + "learning_rate": 1.260904991733991e-07, + "loss": 0.3128, + "step": 13230 + }, + { + "epoch": 4.500318863993877, + "grad_norm": 7.9516763528490335, + "learning_rate": 1.2524142198854062e-07, + "loss": 0.3372, + "step": 13235 + }, + { + "epoch": 4.502019471961226, + "grad_norm": 4.4812834219775155, + "learning_rate": 1.2439513980515045e-07, + "loss": 0.3501, + "step": 13240 + }, + { + "epoch": 4.503720079928574, + "grad_norm": 5.268924961688235, + "learning_rate": 1.2355165361927045e-07, + "loss": 0.3548, + "step": 13245 + }, + { + "epoch": 4.505420687895922, + "grad_norm": 3.3432882139172673, + "learning_rate": 1.227109644236524e-07, + "loss": 0.2985, + "step": 13250 + }, + { + "epoch": 4.507121295863271, + "grad_norm": 4.14663984609508, + "learning_rate": 1.2187307320775526e-07, + "loss": 0.3348, + "step": 13255 + }, + { + "epoch": 4.508821903830619, + "grad_norm": 3.9700860026519345, + "learning_rate": 1.2103798095774665e-07, + "loss": 0.3044, + "step": 13260 + }, + { + "epoch": 4.5105225117979675, + "grad_norm": 3.9006630355912826, + "learning_rate": 1.202056886564973e-07, + "loss": 0.3241, + "step": 13265 + }, + { + "epoch": 4.512223119765316, + "grad_norm": 6.998962310279112, + "learning_rate": 1.1937619728358496e-07, + "loss": 0.3182, + "step": 13270 + }, + { + "epoch": 4.5139237277326645, + "grad_norm": 4.369582853724072, + "learning_rate": 1.1854950781528901e-07, + "loss": 0.3303, + "step": 13275 + }, + { + "epoch": 4.5156243357000125, + "grad_norm": 4.789326373618072, + "learning_rate": 1.177256212245928e-07, + "loss": 0.3501, + "step": 13280 + }, + { + "epoch": 4.5173249436673615, + "grad_norm": 8.395740382579294, + "learning_rate": 1.1690453848117872e-07, + "loss": 0.3328, + "step": 13285 + }, + { + "epoch": 4.5190255516347095, + "grad_norm": 4.082702969850315, + "learning_rate": 1.1608626055143068e-07, + "loss": 0.3265, + "step": 13290 + }, + { + "epoch": 4.520726159602058, + "grad_norm": 5.506729170134501, + "learning_rate": 1.1527078839843164e-07, + "loss": 0.3313, + "step": 13295 + }, + { + "epoch": 4.5224267675694065, + "grad_norm": 4.600119129222766, + "learning_rate": 1.1445812298196079e-07, + "loss": 0.3422, + "step": 13300 + }, + { + "epoch": 4.524127375536755, + "grad_norm": 5.250707685450341, + "learning_rate": 1.1364826525849526e-07, + "loss": 0.308, + "step": 13305 + }, + { + "epoch": 4.525827983504103, + "grad_norm": 5.813778459770343, + "learning_rate": 1.1284121618120675e-07, + "loss": 0.3287, + "step": 13310 + }, + { + "epoch": 4.527528591471451, + "grad_norm": 8.482249036813347, + "learning_rate": 1.120369766999621e-07, + "loss": 0.3542, + "step": 13315 + }, + { + "epoch": 4.5292291994388, + "grad_norm": 44.4986547425875, + "learning_rate": 1.1123554776132028e-07, + "loss": 0.3279, + "step": 13320 + }, + { + "epoch": 4.530929807406148, + "grad_norm": 5.616438060806645, + "learning_rate": 1.1043693030853369e-07, + "loss": 0.3327, + "step": 13325 + }, + { + "epoch": 4.532630415373496, + "grad_norm": 6.246008971073881, + "learning_rate": 1.0964112528154408e-07, + "loss": 0.3208, + "step": 13330 + }, + { + "epoch": 4.534331023340845, + "grad_norm": 20.38889674582648, + "learning_rate": 1.0884813361698526e-07, + "loss": 0.3313, + "step": 13335 + }, + { + "epoch": 4.536031631308193, + "grad_norm": 4.253780013214621, + "learning_rate": 1.0805795624817733e-07, + "loss": 0.3053, + "step": 13340 + }, + { + "epoch": 4.537732239275541, + "grad_norm": 6.978035722690048, + "learning_rate": 1.0727059410513024e-07, + "loss": 0.3326, + "step": 13345 + }, + { + "epoch": 4.539432847242889, + "grad_norm": 4.420401496695248, + "learning_rate": 1.0648604811453911e-07, + "loss": 0.3306, + "step": 13350 + }, + { + "epoch": 4.541133455210238, + "grad_norm": 9.816058519908898, + "learning_rate": 1.0570431919978503e-07, + "loss": 0.3374, + "step": 13355 + }, + { + "epoch": 4.542834063177586, + "grad_norm": 9.606878566032291, + "learning_rate": 1.0492540828093395e-07, + "loss": 0.3263, + "step": 13360 + }, + { + "epoch": 4.544534671144934, + "grad_norm": 7.715141830725023, + "learning_rate": 1.0414931627473396e-07, + "loss": 0.3301, + "step": 13365 + }, + { + "epoch": 4.546235279112283, + "grad_norm": 4.680687545110913, + "learning_rate": 1.0337604409461715e-07, + "loss": 0.3276, + "step": 13370 + }, + { + "epoch": 4.547935887079631, + "grad_norm": 4.453814003722188, + "learning_rate": 1.0260559265069497e-07, + "loss": 0.3377, + "step": 13375 + }, + { + "epoch": 4.549636495046979, + "grad_norm": 23.469453151538225, + "learning_rate": 1.0183796284976011e-07, + "loss": 0.3589, + "step": 13380 + }, + { + "epoch": 4.551337103014328, + "grad_norm": 4.088982279683541, + "learning_rate": 1.0107315559528374e-07, + "loss": 0.3125, + "step": 13385 + }, + { + "epoch": 4.553037710981676, + "grad_norm": 4.8239304234020635, + "learning_rate": 1.0031117178741557e-07, + "loss": 0.3282, + "step": 13390 + }, + { + "epoch": 4.554738318949024, + "grad_norm": 9.11888618207721, + "learning_rate": 9.955201232298123e-08, + "loss": 0.3315, + "step": 13395 + }, + { + "epoch": 4.556438926916373, + "grad_norm": 3.8658203336746086, + "learning_rate": 9.879567809548351e-08, + "loss": 0.3154, + "step": 13400 + }, + { + "epoch": 4.558139534883721, + "grad_norm": 4.233775252166157, + "learning_rate": 9.804216999509897e-08, + "loss": 0.3329, + "step": 13405 + }, + { + "epoch": 4.559840142851069, + "grad_norm": 5.832164655350215, + "learning_rate": 9.729148890867818e-08, + "loss": 0.3363, + "step": 13410 + }, + { + "epoch": 4.561540750818418, + "grad_norm": 4.4491927675603336, + "learning_rate": 9.654363571974496e-08, + "loss": 0.3341, + "step": 13415 + }, + { + "epoch": 4.563241358785766, + "grad_norm": 4.530429269530576, + "learning_rate": 9.57986113084941e-08, + "loss": 0.3352, + "step": 13420 + }, + { + "epoch": 4.564941966753114, + "grad_norm": 3.258923521195808, + "learning_rate": 9.505641655179144e-08, + "loss": 0.3142, + "step": 13425 + }, + { + "epoch": 4.566642574720462, + "grad_norm": 4.257615335573072, + "learning_rate": 9.431705232317179e-08, + "loss": 0.3349, + "step": 13430 + }, + { + "epoch": 4.568343182687811, + "grad_norm": 13.592790964567337, + "learning_rate": 9.358051949283991e-08, + "loss": 0.3212, + "step": 13435 + }, + { + "epoch": 4.570043790655159, + "grad_norm": 6.975885256964604, + "learning_rate": 9.284681892766629e-08, + "loss": 0.3447, + "step": 13440 + }, + { + "epoch": 4.571744398622507, + "grad_norm": 4.395602256939808, + "learning_rate": 9.211595149118957e-08, + "loss": 0.341, + "step": 13445 + }, + { + "epoch": 4.573445006589856, + "grad_norm": 3.8954209473335295, + "learning_rate": 9.138791804361253e-08, + "loss": 0.3265, + "step": 13450 + }, + { + "epoch": 4.575145614557204, + "grad_norm": 5.803626434070616, + "learning_rate": 9.066271944180388e-08, + "loss": 0.3122, + "step": 13455 + }, + { + "epoch": 4.576846222524552, + "grad_norm": 4.045096160827611, + "learning_rate": 8.99403565392945e-08, + "loss": 0.3242, + "step": 13460 + }, + { + "epoch": 4.5785468304919, + "grad_norm": 4.1985007006023425, + "learning_rate": 8.922083018627875e-08, + "loss": 0.3342, + "step": 13465 + }, + { + "epoch": 4.580247438459249, + "grad_norm": 4.812436169736084, + "learning_rate": 8.850414122961171e-08, + "loss": 0.3441, + "step": 13470 + }, + { + "epoch": 4.581948046426597, + "grad_norm": 3.7440161835946806, + "learning_rate": 8.779029051280946e-08, + "loss": 0.3282, + "step": 13475 + }, + { + "epoch": 4.583648654393945, + "grad_norm": 5.370574064417769, + "learning_rate": 8.7079278876048e-08, + "loss": 0.3103, + "step": 13480 + }, + { + "epoch": 4.585349262361294, + "grad_norm": 6.219787022073194, + "learning_rate": 8.637110715616015e-08, + "loss": 0.3431, + "step": 13485 + }, + { + "epoch": 4.587049870328642, + "grad_norm": 4.935446514700299, + "learning_rate": 8.566577618663807e-08, + "loss": 0.3125, + "step": 13490 + }, + { + "epoch": 4.5887504782959905, + "grad_norm": 12.87214378064997, + "learning_rate": 8.496328679762967e-08, + "loss": 0.341, + "step": 13495 + }, + { + "epoch": 4.590451086263339, + "grad_norm": 4.231651641870352, + "learning_rate": 8.426363981593855e-08, + "loss": 0.3313, + "step": 13500 + }, + { + "epoch": 4.5921516942306875, + "grad_norm": 3.879091516868406, + "learning_rate": 8.356683606502269e-08, + "loss": 0.3263, + "step": 13505 + }, + { + "epoch": 4.5938523021980355, + "grad_norm": 5.468098031628809, + "learning_rate": 8.287287636499414e-08, + "loss": 0.3275, + "step": 13510 + }, + { + "epoch": 4.5955529101653845, + "grad_norm": 3.575457095129105, + "learning_rate": 8.218176153261704e-08, + "loss": 0.326, + "step": 13515 + }, + { + "epoch": 4.5972535181327325, + "grad_norm": 4.953138668741619, + "learning_rate": 8.149349238130793e-08, + "loss": 0.3381, + "step": 13520 + }, + { + "epoch": 4.598954126100081, + "grad_norm": 5.24919771321698, + "learning_rate": 8.080806972113331e-08, + "loss": 0.3381, + "step": 13525 + }, + { + "epoch": 4.6006547340674295, + "grad_norm": 4.288167964297829, + "learning_rate": 8.012549435881007e-08, + "loss": 0.3103, + "step": 13530 + }, + { + "epoch": 4.602355342034778, + "grad_norm": 5.194859941964073, + "learning_rate": 7.944576709770363e-08, + "loss": 0.3264, + "step": 13535 + }, + { + "epoch": 4.604055950002126, + "grad_norm": 5.901346190501034, + "learning_rate": 7.87688887378274e-08, + "loss": 0.3298, + "step": 13540 + }, + { + "epoch": 4.605756557969475, + "grad_norm": 16.68260982892744, + "learning_rate": 7.809486007584216e-08, + "loss": 0.3375, + "step": 13545 + }, + { + "epoch": 4.607457165936823, + "grad_norm": 4.701726667900006, + "learning_rate": 7.742368190505334e-08, + "loss": 0.3323, + "step": 13550 + }, + { + "epoch": 4.609157773904171, + "grad_norm": 5.9509914549389595, + "learning_rate": 7.67553550154132e-08, + "loss": 0.3222, + "step": 13555 + }, + { + "epoch": 4.610858381871519, + "grad_norm": 13.976456333749962, + "learning_rate": 7.608988019351699e-08, + "loss": 0.3507, + "step": 13560 + }, + { + "epoch": 4.612558989838868, + "grad_norm": 3.8581391762891872, + "learning_rate": 7.542725822260371e-08, + "loss": 0.3353, + "step": 13565 + }, + { + "epoch": 4.614259597806216, + "grad_norm": 4.830330543756697, + "learning_rate": 7.476748988255428e-08, + "loss": 0.3094, + "step": 13570 + }, + { + "epoch": 4.615960205773564, + "grad_norm": 4.012557265721932, + "learning_rate": 7.41105759498914e-08, + "loss": 0.3173, + "step": 13575 + }, + { + "epoch": 4.617660813740913, + "grad_norm": 4.248863274590059, + "learning_rate": 7.345651719777775e-08, + "loss": 0.3523, + "step": 13580 + }, + { + "epoch": 4.619361421708261, + "grad_norm": 6.913313645146682, + "learning_rate": 7.280531439601641e-08, + "loss": 0.3098, + "step": 13585 + }, + { + "epoch": 4.621062029675609, + "grad_norm": 3.841679848706814, + "learning_rate": 7.215696831104791e-08, + "loss": 0.3524, + "step": 13590 + }, + { + "epoch": 4.622762637642957, + "grad_norm": 2.9552779005802967, + "learning_rate": 7.151147970595129e-08, + "loss": 0.3329, + "step": 13595 + }, + { + "epoch": 4.624463245610306, + "grad_norm": 3.7823566013155685, + "learning_rate": 7.086884934044302e-08, + "loss": 0.3349, + "step": 13600 + }, + { + "epoch": 4.626163853577654, + "grad_norm": 17.23201939605056, + "learning_rate": 7.02290779708742e-08, + "loss": 0.351, + "step": 13605 + }, + { + "epoch": 4.627864461545002, + "grad_norm": 10.04970677446816, + "learning_rate": 6.959216635023191e-08, + "loss": 0.3265, + "step": 13610 + }, + { + "epoch": 4.629565069512351, + "grad_norm": 5.0992004095698995, + "learning_rate": 6.895811522813683e-08, + "loss": 0.3352, + "step": 13615 + }, + { + "epoch": 4.631265677479699, + "grad_norm": 4.928106483892405, + "learning_rate": 6.832692535084395e-08, + "loss": 0.325, + "step": 13620 + }, + { + "epoch": 4.632966285447047, + "grad_norm": 4.225590866855814, + "learning_rate": 6.769859746123931e-08, + "loss": 0.3152, + "step": 13625 + }, + { + "epoch": 4.634666893414396, + "grad_norm": 6.029621948014496, + "learning_rate": 6.70731322988416e-08, + "loss": 0.3344, + "step": 13630 + }, + { + "epoch": 4.636367501381744, + "grad_norm": 5.840225724774846, + "learning_rate": 6.645053059979923e-08, + "loss": 0.3422, + "step": 13635 + }, + { + "epoch": 4.638068109349092, + "grad_norm": 5.255603762891737, + "learning_rate": 6.583079309689183e-08, + "loss": 0.3314, + "step": 13640 + }, + { + "epoch": 4.639768717316441, + "grad_norm": 4.098687599519045, + "learning_rate": 6.521392051952653e-08, + "loss": 0.3333, + "step": 13645 + }, + { + "epoch": 4.641469325283789, + "grad_norm": 5.255583875205857, + "learning_rate": 6.45999135937389e-08, + "loss": 0.3271, + "step": 13650 + }, + { + "epoch": 4.643169933251137, + "grad_norm": 5.571635080468063, + "learning_rate": 6.398877304219287e-08, + "loss": 0.3267, + "step": 13655 + }, + { + "epoch": 4.644870541218486, + "grad_norm": 5.6399676170928315, + "learning_rate": 6.338049958417692e-08, + "loss": 0.3274, + "step": 13660 + }, + { + "epoch": 4.646571149185834, + "grad_norm": 9.774654917394303, + "learning_rate": 6.277509393560672e-08, + "loss": 0.3303, + "step": 13665 + }, + { + "epoch": 4.648271757153182, + "grad_norm": 4.612499635823047, + "learning_rate": 6.217255680902146e-08, + "loss": 0.3338, + "step": 13670 + }, + { + "epoch": 4.64997236512053, + "grad_norm": 16.70420980081881, + "learning_rate": 6.157288891358498e-08, + "loss": 0.3191, + "step": 13675 + }, + { + "epoch": 4.651672973087879, + "grad_norm": 3.7846285855313115, + "learning_rate": 6.097609095508355e-08, + "loss": 0.3203, + "step": 13680 + }, + { + "epoch": 4.653373581055227, + "grad_norm": 4.60325436502952, + "learning_rate": 6.038216363592614e-08, + "loss": 0.3381, + "step": 13685 + }, + { + "epoch": 4.655074189022575, + "grad_norm": 142.50050566162332, + "learning_rate": 5.979110765514273e-08, + "loss": 0.3313, + "step": 13690 + }, + { + "epoch": 4.656774796989924, + "grad_norm": 23.86895854571442, + "learning_rate": 5.92029237083841e-08, + "loss": 0.3359, + "step": 13695 + }, + { + "epoch": 4.658475404957272, + "grad_norm": 5.283896466005684, + "learning_rate": 5.8617612487920364e-08, + "loss": 0.3283, + "step": 13700 + }, + { + "epoch": 4.66017601292462, + "grad_norm": 4.779382002247554, + "learning_rate": 5.8035174682641024e-08, + "loss": 0.3206, + "step": 13705 + }, + { + "epoch": 4.661876620891968, + "grad_norm": 7.054684759020897, + "learning_rate": 5.74556109780533e-08, + "loss": 0.3202, + "step": 13710 + }, + { + "epoch": 4.663577228859317, + "grad_norm": 7.206859116855702, + "learning_rate": 5.6878922056281816e-08, + "loss": 0.3219, + "step": 13715 + }, + { + "epoch": 4.665277836826665, + "grad_norm": 6.1873721233225725, + "learning_rate": 5.630510859606808e-08, + "loss": 0.3104, + "step": 13720 + }, + { + "epoch": 4.6669784447940135, + "grad_norm": 7.693323171395487, + "learning_rate": 5.573417127276853e-08, + "loss": 0.326, + "step": 13725 + }, + { + "epoch": 4.668679052761362, + "grad_norm": 3.5717677038539812, + "learning_rate": 5.5166110758355375e-08, + "loss": 0.3046, + "step": 13730 + }, + { + "epoch": 4.6703796607287105, + "grad_norm": 3.875300346012126, + "learning_rate": 5.4600927721413786e-08, + "loss": 0.3232, + "step": 13735 + }, + { + "epoch": 4.6720802686960585, + "grad_norm": 4.185631255677343, + "learning_rate": 5.403862282714362e-08, + "loss": 0.3321, + "step": 13740 + }, + { + "epoch": 4.6737808766634075, + "grad_norm": 14.202455049654725, + "learning_rate": 5.347919673735602e-08, + "loss": 0.3181, + "step": 13745 + }, + { + "epoch": 4.6754814846307555, + "grad_norm": 4.3145911797537115, + "learning_rate": 5.292265011047487e-08, + "loss": 0.3529, + "step": 13750 + }, + { + "epoch": 4.677182092598104, + "grad_norm": 4.891949675153146, + "learning_rate": 5.236898360153425e-08, + "loss": 0.3432, + "step": 13755 + }, + { + "epoch": 4.6788827005654525, + "grad_norm": 6.60604531632661, + "learning_rate": 5.181819786217901e-08, + "loss": 0.3129, + "step": 13760 + }, + { + "epoch": 4.680583308532801, + "grad_norm": 3.4956157623637902, + "learning_rate": 5.1270293540663095e-08, + "loss": 0.3234, + "step": 13765 + }, + { + "epoch": 4.682283916500149, + "grad_norm": 6.734484318198087, + "learning_rate": 5.072527128184956e-08, + "loss": 0.3537, + "step": 13770 + }, + { + "epoch": 4.683984524467498, + "grad_norm": 10.341178508350671, + "learning_rate": 5.01831317272089e-08, + "loss": 0.3357, + "step": 13775 + }, + { + "epoch": 4.685685132434846, + "grad_norm": 4.047827109768496, + "learning_rate": 4.964387551481875e-08, + "loss": 0.3282, + "step": 13780 + }, + { + "epoch": 4.687385740402194, + "grad_norm": 6.8714161053726315, + "learning_rate": 4.910750327936392e-08, + "loss": 0.3501, + "step": 13785 + }, + { + "epoch": 4.689086348369542, + "grad_norm": 8.505102959858387, + "learning_rate": 4.85740156521336e-08, + "loss": 0.3168, + "step": 13790 + }, + { + "epoch": 4.690786956336891, + "grad_norm": 4.7115482090418634, + "learning_rate": 4.804341326102358e-08, + "loss": 0.3459, + "step": 13795 + }, + { + "epoch": 4.692487564304239, + "grad_norm": 6.277115723449739, + "learning_rate": 4.75156967305318e-08, + "loss": 0.3508, + "step": 13800 + }, + { + "epoch": 4.694188172271587, + "grad_norm": 7.547462766787185, + "learning_rate": 4.699086668176173e-08, + "loss": 0.3464, + "step": 13805 + }, + { + "epoch": 4.695888780238936, + "grad_norm": 4.296686807226226, + "learning_rate": 4.646892373241812e-08, + "loss": 0.3405, + "step": 13810 + }, + { + "epoch": 4.697589388206284, + "grad_norm": 4.407793513535197, + "learning_rate": 4.594986849680821e-08, + "loss": 0.3285, + "step": 13815 + }, + { + "epoch": 4.699289996173632, + "grad_norm": 10.953418019870245, + "learning_rate": 4.543370158584054e-08, + "loss": 0.3294, + "step": 13820 + }, + { + "epoch": 4.70099060414098, + "grad_norm": 4.815143078865413, + "learning_rate": 4.4920423607024144e-08, + "loss": 0.3429, + "step": 13825 + }, + { + "epoch": 4.702691212108329, + "grad_norm": 3.8748734207286373, + "learning_rate": 4.441003516446773e-08, + "loss": 0.3321, + "step": 13830 + }, + { + "epoch": 4.704391820075677, + "grad_norm": 5.964705868539915, + "learning_rate": 4.390253685887941e-08, + "loss": 0.3069, + "step": 13835 + }, + { + "epoch": 4.706092428043025, + "grad_norm": 4.517574517561679, + "learning_rate": 4.339792928756581e-08, + "loss": 0.3323, + "step": 13840 + }, + { + "epoch": 4.707793036010374, + "grad_norm": 5.3957387491489985, + "learning_rate": 4.289621304443076e-08, + "loss": 0.34, + "step": 13845 + }, + { + "epoch": 4.709493643977722, + "grad_norm": 7.735334001093503, + "learning_rate": 4.239738871997551e-08, + "loss": 0.3408, + "step": 13850 + }, + { + "epoch": 4.71119425194507, + "grad_norm": 4.054492180541433, + "learning_rate": 4.190145690129738e-08, + "loss": 0.3139, + "step": 13855 + }, + { + "epoch": 4.712894859912419, + "grad_norm": 6.348049582111246, + "learning_rate": 4.140841817208946e-08, + "loss": 0.3166, + "step": 13860 + }, + { + "epoch": 4.714595467879767, + "grad_norm": 4.1275479057128575, + "learning_rate": 4.091827311264007e-08, + "loss": 0.3214, + "step": 13865 + }, + { + "epoch": 4.716296075847115, + "grad_norm": 3.657059874633293, + "learning_rate": 4.043102229983109e-08, + "loss": 0.3241, + "step": 13870 + }, + { + "epoch": 4.717996683814464, + "grad_norm": 8.612859433209668, + "learning_rate": 3.994666630713878e-08, + "loss": 0.3113, + "step": 13875 + }, + { + "epoch": 4.719697291781812, + "grad_norm": 9.237348760144329, + "learning_rate": 3.946520570463158e-08, + "loss": 0.3434, + "step": 13880 + }, + { + "epoch": 4.72139789974916, + "grad_norm": 7.960686824647489, + "learning_rate": 3.898664105897065e-08, + "loss": 0.3488, + "step": 13885 + }, + { + "epoch": 4.723098507716509, + "grad_norm": 5.552691101089457, + "learning_rate": 3.851097293340877e-08, + "loss": 0.3466, + "step": 13890 + }, + { + "epoch": 4.724799115683857, + "grad_norm": 4.29663431060009, + "learning_rate": 3.803820188778895e-08, + "loss": 0.3259, + "step": 13895 + }, + { + "epoch": 4.726499723651205, + "grad_norm": 5.090123301435192, + "learning_rate": 3.756832847854525e-08, + "loss": 0.3119, + "step": 13900 + }, + { + "epoch": 4.728200331618554, + "grad_norm": 6.696393552711733, + "learning_rate": 3.710135325870085e-08, + "loss": 0.3354, + "step": 13905 + }, + { + "epoch": 4.729900939585902, + "grad_norm": 4.918072272655382, + "learning_rate": 3.663727677786833e-08, + "loss": 0.3327, + "step": 13910 + }, + { + "epoch": 4.73160154755325, + "grad_norm": 5.2599558489731315, + "learning_rate": 3.6176099582247716e-08, + "loss": 0.3405, + "step": 13915 + }, + { + "epoch": 4.733302155520598, + "grad_norm": 7.8895944654960815, + "learning_rate": 3.5717822214627606e-08, + "loss": 0.3335, + "step": 13920 + }, + { + "epoch": 4.735002763487947, + "grad_norm": 4.802501219254316, + "learning_rate": 3.526244521438321e-08, + "loss": 0.3502, + "step": 13925 + }, + { + "epoch": 4.736703371455295, + "grad_norm": 9.590248448123884, + "learning_rate": 3.4809969117475806e-08, + "loss": 0.3316, + "step": 13930 + }, + { + "epoch": 4.738403979422643, + "grad_norm": 3.695100725537164, + "learning_rate": 3.4360394456453004e-08, + "loss": 0.3461, + "step": 13935 + }, + { + "epoch": 4.740104587389992, + "grad_norm": 4.148114638485843, + "learning_rate": 3.3913721760447104e-08, + "loss": 0.3223, + "step": 13940 + }, + { + "epoch": 4.74180519535734, + "grad_norm": 10.58552100482856, + "learning_rate": 3.3469951555175075e-08, + "loss": 0.3346, + "step": 13945 + }, + { + "epoch": 4.743505803324688, + "grad_norm": 2.508138019154806, + "learning_rate": 3.3029084362938005e-08, + "loss": 0.332, + "step": 13950 + }, + { + "epoch": 4.7452064112920365, + "grad_norm": 3.9719421989029775, + "learning_rate": 3.259112070261944e-08, + "loss": 0.3278, + "step": 13955 + }, + { + "epoch": 4.746907019259385, + "grad_norm": 4.152108309347853, + "learning_rate": 3.2156061089686776e-08, + "loss": 0.3303, + "step": 13960 + }, + { + "epoch": 4.7486076272267335, + "grad_norm": 5.436167936445684, + "learning_rate": 3.172390603618847e-08, + "loss": 0.3251, + "step": 13965 + }, + { + "epoch": 4.7503082351940815, + "grad_norm": 4.499253144753351, + "learning_rate": 3.129465605075488e-08, + "loss": 0.3328, + "step": 13970 + }, + { + "epoch": 4.7520088431614305, + "grad_norm": 14.816044571670451, + "learning_rate": 3.086831163859661e-08, + "loss": 0.3413, + "step": 13975 + }, + { + "epoch": 4.7537094511287785, + "grad_norm": 4.341032786225299, + "learning_rate": 3.044487330150558e-08, + "loss": 0.3258, + "step": 13980 + }, + { + "epoch": 4.755410059096127, + "grad_norm": 13.714824551807652, + "learning_rate": 3.002434153785261e-08, + "loss": 0.3166, + "step": 13985 + }, + { + "epoch": 4.7571106670634755, + "grad_norm": 5.47757169285566, + "learning_rate": 2.960671684258759e-08, + "loss": 0.3161, + "step": 13990 + }, + { + "epoch": 4.758811275030824, + "grad_norm": 3.7629727614857464, + "learning_rate": 2.9191999707239292e-08, + "loss": 0.3173, + "step": 13995 + }, + { + "epoch": 4.760511882998172, + "grad_norm": 3.65754739288696, + "learning_rate": 2.8780190619914216e-08, + "loss": 0.3233, + "step": 14000 + }, + { + "epoch": 4.762212490965521, + "grad_norm": 8.729693460847791, + "learning_rate": 2.8371290065295764e-08, + "loss": 0.3526, + "step": 14005 + }, + { + "epoch": 4.763913098932869, + "grad_norm": 4.676875112956524, + "learning_rate": 2.79652985246448e-08, + "loss": 0.3408, + "step": 14010 + }, + { + "epoch": 4.765613706900217, + "grad_norm": 17.946843511877965, + "learning_rate": 2.7562216475797986e-08, + "loss": 0.3343, + "step": 14015 + }, + { + "epoch": 4.767314314867566, + "grad_norm": 3.772720874376048, + "learning_rate": 2.7162044393167498e-08, + "loss": 0.3221, + "step": 14020 + }, + { + "epoch": 4.769014922834914, + "grad_norm": 7.405290276612255, + "learning_rate": 2.676478274774158e-08, + "loss": 0.3259, + "step": 14025 + }, + { + "epoch": 4.770715530802262, + "grad_norm": 9.481284247622, + "learning_rate": 2.6370432007081502e-08, + "loss": 0.3209, + "step": 14030 + }, + { + "epoch": 4.77241613876961, + "grad_norm": 6.465908227524739, + "learning_rate": 2.5978992635323773e-08, + "loss": 0.3221, + "step": 14035 + }, + { + "epoch": 4.774116746736959, + "grad_norm": 6.157097149133531, + "learning_rate": 2.5590465093177087e-08, + "loss": 0.3272, + "step": 14040 + }, + { + "epoch": 4.775817354704307, + "grad_norm": 3.8184457373119436, + "learning_rate": 2.520484983792454e-08, + "loss": 0.325, + "step": 14045 + }, + { + "epoch": 4.777517962671655, + "grad_norm": 7.076044755832123, + "learning_rate": 2.4822147323420032e-08, + "loss": 0.3345, + "step": 14050 + }, + { + "epoch": 4.779218570639004, + "grad_norm": 4.090298221303333, + "learning_rate": 2.444235800009076e-08, + "loss": 0.3248, + "step": 14055 + }, + { + "epoch": 4.780919178606352, + "grad_norm": 11.176518096915373, + "learning_rate": 2.406548231493361e-08, + "loss": 0.3425, + "step": 14060 + }, + { + "epoch": 4.7826197865737, + "grad_norm": 7.616449195698886, + "learning_rate": 2.3691520711517923e-08, + "loss": 0.3395, + "step": 14065 + }, + { + "epoch": 4.784320394541048, + "grad_norm": 3.341981099864446, + "learning_rate": 2.332047362998191e-08, + "loss": 0.3482, + "step": 14070 + }, + { + "epoch": 4.786021002508397, + "grad_norm": 6.11811041814841, + "learning_rate": 2.295234150703429e-08, + "loss": 0.345, + "step": 14075 + }, + { + "epoch": 4.787721610475745, + "grad_norm": 4.48366183201597, + "learning_rate": 2.258712477595265e-08, + "loss": 0.3466, + "step": 14080 + }, + { + "epoch": 4.789422218443093, + "grad_norm": 3.3110746330710454, + "learning_rate": 2.2224823866583145e-08, + "loss": 0.3197, + "step": 14085 + }, + { + "epoch": 4.791122826410442, + "grad_norm": 5.6015300192522, + "learning_rate": 2.186543920534051e-08, + "loss": 0.3269, + "step": 14090 + }, + { + "epoch": 4.79282343437779, + "grad_norm": 3.731054334037487, + "learning_rate": 2.1508971215206953e-08, + "loss": 0.3583, + "step": 14095 + }, + { + "epoch": 4.794524042345138, + "grad_norm": 4.66466334818196, + "learning_rate": 2.115542031573159e-08, + "loss": 0.3386, + "step": 14100 + }, + { + "epoch": 4.796224650312487, + "grad_norm": 5.25031709764381, + "learning_rate": 2.0804786923031008e-08, + "loss": 0.3379, + "step": 14105 + }, + { + "epoch": 4.797925258279835, + "grad_norm": 10.516944628469465, + "learning_rate": 2.0457071449787315e-08, + "loss": 0.3225, + "step": 14110 + }, + { + "epoch": 4.799625866247183, + "grad_norm": 3.5611550276427715, + "learning_rate": 2.0112274305248426e-08, + "loss": 0.3161, + "step": 14115 + }, + { + "epoch": 4.801326474214532, + "grad_norm": 7.576371200543571, + "learning_rate": 1.977039589522778e-08, + "loss": 0.3405, + "step": 14120 + }, + { + "epoch": 4.80302708218188, + "grad_norm": 5.451582141759962, + "learning_rate": 1.943143662210295e-08, + "loss": 0.3234, + "step": 14125 + }, + { + "epoch": 4.804727690149228, + "grad_norm": 5.586511869261489, + "learning_rate": 1.9095396884817043e-08, + "loss": 0.3386, + "step": 14130 + }, + { + "epoch": 4.806428298116577, + "grad_norm": 12.158692728698131, + "learning_rate": 1.8762277078875346e-08, + "loss": 0.3168, + "step": 14135 + }, + { + "epoch": 4.808128906083925, + "grad_norm": 3.6248018478769986, + "learning_rate": 1.843207759634813e-08, + "loss": 0.3253, + "step": 14140 + }, + { + "epoch": 4.809829514051273, + "grad_norm": 11.92575864371344, + "learning_rate": 1.810479882586702e-08, + "loss": 0.335, + "step": 14145 + }, + { + "epoch": 4.811530122018621, + "grad_norm": 4.050455765642693, + "learning_rate": 1.7780441152627227e-08, + "loss": 0.3384, + "step": 14150 + }, + { + "epoch": 4.81323072998597, + "grad_norm": 7.165397203447299, + "learning_rate": 1.7459004958385317e-08, + "loss": 0.3406, + "step": 14155 + }, + { + "epoch": 4.814931337953318, + "grad_norm": 6.031532642070139, + "learning_rate": 1.7140490621459782e-08, + "loss": 0.3431, + "step": 14160 + }, + { + "epoch": 4.816631945920666, + "grad_norm": 16.529377007280242, + "learning_rate": 1.6824898516729916e-08, + "loss": 0.2999, + "step": 14165 + }, + { + "epoch": 4.818332553888015, + "grad_norm": 6.834412260587851, + "learning_rate": 1.6512229015635817e-08, + "loss": 0.3104, + "step": 14170 + }, + { + "epoch": 4.820033161855363, + "grad_norm": 4.2081382672489775, + "learning_rate": 1.620248248617784e-08, + "loss": 0.324, + "step": 14175 + }, + { + "epoch": 4.821733769822711, + "grad_norm": 4.531693339378879, + "learning_rate": 1.5895659292915477e-08, + "loss": 0.3341, + "step": 14180 + }, + { + "epoch": 4.8234343777900595, + "grad_norm": 11.082332131058152, + "learning_rate": 1.559175979696875e-08, + "loss": 0.3127, + "step": 14185 + }, + { + "epoch": 4.825134985757408, + "grad_norm": 6.595292995331069, + "learning_rate": 1.5290784356015166e-08, + "loss": 0.3168, + "step": 14190 + }, + { + "epoch": 4.8268355937247565, + "grad_norm": 5.052925451408609, + "learning_rate": 1.4992733324292465e-08, + "loss": 0.299, + "step": 14195 + }, + { + "epoch": 4.8285362016921045, + "grad_norm": 4.504702164111421, + "learning_rate": 1.4697607052594487e-08, + "loss": 0.3065, + "step": 14200 + }, + { + "epoch": 4.8302368096594535, + "grad_norm": 5.758256087628319, + "learning_rate": 1.4405405888274492e-08, + "loss": 0.3106, + "step": 14205 + }, + { + "epoch": 4.8319374176268015, + "grad_norm": 5.2199824865839215, + "learning_rate": 1.4116130175241826e-08, + "loss": 0.3041, + "step": 14210 + }, + { + "epoch": 4.83363802559415, + "grad_norm": 3.9418870967087383, + "learning_rate": 1.382978025396331e-08, + "loss": 0.3182, + "step": 14215 + }, + { + "epoch": 4.8353386335614985, + "grad_norm": 4.6303149964657875, + "learning_rate": 1.3546356461462129e-08, + "loss": 0.326, + "step": 14220 + }, + { + "epoch": 4.837039241528847, + "grad_norm": 5.553330113835605, + "learning_rate": 1.3265859131317004e-08, + "loss": 0.3169, + "step": 14225 + }, + { + "epoch": 4.838739849496195, + "grad_norm": 6.2443968915828405, + "learning_rate": 1.2988288593663301e-08, + "loss": 0.3101, + "step": 14230 + }, + { + "epoch": 4.840440457463544, + "grad_norm": 4.663283684252, + "learning_rate": 1.2713645175190526e-08, + "loss": 0.3028, + "step": 14235 + }, + { + "epoch": 4.842141065430892, + "grad_norm": 3.766050860064333, + "learning_rate": 1.2441929199143998e-08, + "loss": 0.3188, + "step": 14240 + }, + { + "epoch": 4.84384167339824, + "grad_norm": 4.965225020822388, + "learning_rate": 1.2173140985323183e-08, + "loss": 0.3326, + "step": 14245 + }, + { + "epoch": 4.845542281365589, + "grad_norm": 4.884112848650207, + "learning_rate": 1.1907280850081416e-08, + "loss": 0.3297, + "step": 14250 + }, + { + "epoch": 4.847242889332937, + "grad_norm": 4.079118794301395, + "learning_rate": 1.1644349106326446e-08, + "loss": 0.3458, + "step": 14255 + }, + { + "epoch": 4.848943497300285, + "grad_norm": 4.606447266319099, + "learning_rate": 1.138434606351907e-08, + "loss": 0.304, + "step": 14260 + }, + { + "epoch": 4.850644105267634, + "grad_norm": 7.463303603237622, + "learning_rate": 1.1127272027672553e-08, + "loss": 0.3416, + "step": 14265 + }, + { + "epoch": 4.852344713234982, + "grad_norm": 5.457559221356962, + "learning_rate": 1.0873127301353759e-08, + "loss": 0.3433, + "step": 14270 + }, + { + "epoch": 4.85404532120233, + "grad_norm": 3.8041665918547927, + "learning_rate": 1.0621912183681471e-08, + "loss": 0.3236, + "step": 14275 + }, + { + "epoch": 4.855745929169678, + "grad_norm": 4.930478268701416, + "learning_rate": 1.0373626970326122e-08, + "loss": 0.3263, + "step": 14280 + }, + { + "epoch": 4.857446537137027, + "grad_norm": 3.8579986437154767, + "learning_rate": 1.0128271953510627e-08, + "loss": 0.3287, + "step": 14285 + }, + { + "epoch": 4.859147145104375, + "grad_norm": 3.938039554816089, + "learning_rate": 9.885847422008155e-09, + "loss": 0.3169, + "step": 14290 + }, + { + "epoch": 4.860847753071723, + "grad_norm": 3.2680166121584744, + "learning_rate": 9.646353661143248e-09, + "loss": 0.2959, + "step": 14295 + }, + { + "epoch": 4.862548361039072, + "grad_norm": 4.24184915115148, + "learning_rate": 9.409790952791265e-09, + "loss": 0.3199, + "step": 14300 + }, + { + "epoch": 4.86424896900642, + "grad_norm": 4.150684459822878, + "learning_rate": 9.176159575377542e-09, + "loss": 0.3193, + "step": 14305 + }, + { + "epoch": 4.865949576973768, + "grad_norm": 4.386856416297783, + "learning_rate": 8.945459803877399e-09, + "loss": 0.3256, + "step": 14310 + }, + { + "epoch": 4.867650184941116, + "grad_norm": 7.261086576109466, + "learning_rate": 8.717691909815861e-09, + "loss": 0.3317, + "step": 14315 + }, + { + "epoch": 4.869350792908465, + "grad_norm": 4.526817758996456, + "learning_rate": 8.492856161266827e-09, + "loss": 0.3382, + "step": 14320 + }, + { + "epoch": 4.871051400875813, + "grad_norm": 9.745905029849254, + "learning_rate": 8.270952822854173e-09, + "loss": 0.3144, + "step": 14325 + }, + { + "epoch": 4.872752008843161, + "grad_norm": 4.157833604852971, + "learning_rate": 8.051982155748983e-09, + "loss": 0.3324, + "step": 14330 + }, + { + "epoch": 4.87445261681051, + "grad_norm": 5.334779588145237, + "learning_rate": 7.835944417672047e-09, + "loss": 0.3368, + "step": 14335 + }, + { + "epoch": 4.876153224777858, + "grad_norm": 4.234055385260185, + "learning_rate": 7.622839862891363e-09, + "loss": 0.3215, + "step": 14340 + }, + { + "epoch": 4.877853832745206, + "grad_norm": 6.635208082117994, + "learning_rate": 7.412668742223239e-09, + "loss": 0.3282, + "step": 14345 + }, + { + "epoch": 4.879554440712555, + "grad_norm": 4.579780726869808, + "learning_rate": 7.205431303030919e-09, + "loss": 0.3453, + "step": 14350 + }, + { + "epoch": 4.881255048679903, + "grad_norm": 7.936028346894105, + "learning_rate": 7.001127789225404e-09, + "loss": 0.3272, + "step": 14355 + }, + { + "epoch": 4.882955656647251, + "grad_norm": 5.496662933139069, + "learning_rate": 6.799758441263793e-09, + "loss": 0.3329, + "step": 14360 + }, + { + "epoch": 4.8846562646146, + "grad_norm": 7.823396970762261, + "learning_rate": 6.601323496150391e-09, + "loss": 0.3202, + "step": 14365 + }, + { + "epoch": 4.886356872581948, + "grad_norm": 33.03284728252866, + "learning_rate": 6.405823187435878e-09, + "loss": 0.336, + "step": 14370 + }, + { + "epoch": 4.888057480549296, + "grad_norm": 4.953899415870555, + "learning_rate": 6.21325774521675e-09, + "loss": 0.3167, + "step": 14375 + }, + { + "epoch": 4.889758088516645, + "grad_norm": 4.117838498124844, + "learning_rate": 6.023627396135046e-09, + "loss": 0.3446, + "step": 14380 + }, + { + "epoch": 4.891458696483993, + "grad_norm": 3.9367115304478677, + "learning_rate": 5.836932363378345e-09, + "loss": 0.3222, + "step": 14385 + }, + { + "epoch": 4.893159304451341, + "grad_norm": 4.510819927911293, + "learning_rate": 5.653172866680323e-09, + "loss": 0.3249, + "step": 14390 + }, + { + "epoch": 4.894859912418689, + "grad_norm": 10.001543119830593, + "learning_rate": 5.472349122318532e-09, + "loss": 0.3575, + "step": 14395 + }, + { + "epoch": 4.896560520386038, + "grad_norm": 4.7034873943443385, + "learning_rate": 5.294461343115509e-09, + "loss": 0.3358, + "step": 14400 + }, + { + "epoch": 4.898261128353386, + "grad_norm": 5.090134015217698, + "learning_rate": 5.119509738439332e-09, + "loss": 0.3306, + "step": 14405 + }, + { + "epoch": 4.899961736320734, + "grad_norm": 4.590455859703564, + "learning_rate": 4.947494514200568e-09, + "loss": 0.3223, + "step": 14410 + }, + { + "epoch": 4.901662344288083, + "grad_norm": 6.328085485050238, + "learning_rate": 4.778415872855047e-09, + "loss": 0.3443, + "step": 14415 + }, + { + "epoch": 4.903362952255431, + "grad_norm": 3.538138410045203, + "learning_rate": 4.612274013401918e-09, + "loss": 0.3497, + "step": 14420 + }, + { + "epoch": 4.9050635602227795, + "grad_norm": 6.790611192620718, + "learning_rate": 4.449069131383932e-09, + "loss": 0.3301, + "step": 14425 + }, + { + "epoch": 4.9067641681901275, + "grad_norm": 6.7806268756366945, + "learning_rate": 4.288801418887156e-09, + "loss": 0.336, + "step": 14430 + }, + { + "epoch": 4.9084647761574765, + "grad_norm": 10.540979291550155, + "learning_rate": 4.131471064540427e-09, + "loss": 0.3295, + "step": 14435 + }, + { + "epoch": 4.9101653841248245, + "grad_norm": 3.7211698539878655, + "learning_rate": 3.977078253515898e-09, + "loss": 0.3135, + "step": 14440 + }, + { + "epoch": 4.911865992092173, + "grad_norm": 3.5706156017562978, + "learning_rate": 3.825623167527936e-09, + "loss": 0.3122, + "step": 14445 + }, + { + "epoch": 4.9135666000595215, + "grad_norm": 5.847975032828931, + "learning_rate": 3.6771059848333956e-09, + "loss": 0.3311, + "step": 14450 + }, + { + "epoch": 4.91526720802687, + "grad_norm": 3.381943041804788, + "learning_rate": 3.531526880231617e-09, + "loss": 0.3191, + "step": 14455 + }, + { + "epoch": 4.916967815994218, + "grad_norm": 5.237877413713388, + "learning_rate": 3.388886025063598e-09, + "loss": 0.3235, + "step": 14460 + }, + { + "epoch": 4.918668423961567, + "grad_norm": 4.369904248870116, + "learning_rate": 3.2491835872125455e-09, + "loss": 0.3283, + "step": 14465 + }, + { + "epoch": 4.920369031928915, + "grad_norm": 6.527563018262631, + "learning_rate": 3.1124197311024896e-09, + "loss": 0.2977, + "step": 14470 + }, + { + "epoch": 4.922069639896263, + "grad_norm": 6.879941803520668, + "learning_rate": 2.9785946176996703e-09, + "loss": 0.3217, + "step": 14475 + }, + { + "epoch": 4.923770247863612, + "grad_norm": 5.22069543753945, + "learning_rate": 2.8477084045111513e-09, + "loss": 0.3536, + "step": 14480 + }, + { + "epoch": 4.92547085583096, + "grad_norm": 4.145546124082018, + "learning_rate": 2.7197612455850952e-09, + "loss": 0.3381, + "step": 14485 + }, + { + "epoch": 4.927171463798308, + "grad_norm": 4.2734827922564875, + "learning_rate": 2.5947532915102105e-09, + "loss": 0.34, + "step": 14490 + }, + { + "epoch": 4.928872071765657, + "grad_norm": 6.600727915702121, + "learning_rate": 2.4726846894165823e-09, + "loss": 0.3416, + "step": 14495 + }, + { + "epoch": 4.930572679733005, + "grad_norm": 4.013607295901653, + "learning_rate": 2.353555582974287e-09, + "loss": 0.3262, + "step": 14500 + }, + { + "epoch": 4.932273287700353, + "grad_norm": 4.028623933831119, + "learning_rate": 2.2373661123936687e-09, + "loss": 0.3392, + "step": 14505 + }, + { + "epoch": 4.933973895667701, + "grad_norm": 4.982694995695445, + "learning_rate": 2.124116414425059e-09, + "loss": 0.3388, + "step": 14510 + }, + { + "epoch": 4.93567450363505, + "grad_norm": 22.898140404287325, + "learning_rate": 2.0138066223596153e-09, + "loss": 0.3288, + "step": 14515 + }, + { + "epoch": 4.937375111602398, + "grad_norm": 7.380963270845392, + "learning_rate": 1.90643686602765e-09, + "loss": 0.3225, + "step": 14520 + }, + { + "epoch": 4.939075719569746, + "grad_norm": 4.637343762454011, + "learning_rate": 1.8020072717991889e-09, + "loss": 0.3244, + "step": 14525 + }, + { + "epoch": 4.940776327537095, + "grad_norm": 6.8534054772689315, + "learning_rate": 1.7005179625842471e-09, + "loss": 0.3398, + "step": 14530 + }, + { + "epoch": 4.942476935504443, + "grad_norm": 4.787676916447148, + "learning_rate": 1.6019690578314428e-09, + "loss": 0.3373, + "step": 14535 + }, + { + "epoch": 4.944177543471791, + "grad_norm": 6.9960588478076, + "learning_rate": 1.5063606735293835e-09, + "loss": 0.3367, + "step": 14540 + }, + { + "epoch": 4.945878151439139, + "grad_norm": 4.598526229363827, + "learning_rate": 1.4136929222058337e-09, + "loss": 0.3277, + "step": 14545 + }, + { + "epoch": 4.947578759406488, + "grad_norm": 5.057956091708963, + "learning_rate": 1.3239659129266059e-09, + "loss": 0.324, + "step": 14550 + }, + { + "epoch": 4.949279367373836, + "grad_norm": 5.521031541373788, + "learning_rate": 1.2371797512975014e-09, + "loss": 0.3574, + "step": 14555 + }, + { + "epoch": 4.950979975341184, + "grad_norm": 4.256729862265275, + "learning_rate": 1.1533345394623696e-09, + "loss": 0.3278, + "step": 14560 + }, + { + "epoch": 4.952680583308533, + "grad_norm": 7.058631958237392, + "learning_rate": 1.0724303761042165e-09, + "loss": 0.3161, + "step": 14565 + }, + { + "epoch": 4.954381191275881, + "grad_norm": 4.0105963723265114, + "learning_rate": 9.944673564435403e-10, + "loss": 0.3388, + "step": 14570 + }, + { + "epoch": 4.956081799243229, + "grad_norm": 5.0775775613873515, + "learning_rate": 9.194455722405515e-10, + "loss": 0.3169, + "step": 14575 + }, + { + "epoch": 4.957782407210578, + "grad_norm": 16.218514657286182, + "learning_rate": 8.473651117923976e-10, + "loss": 0.3212, + "step": 14580 + }, + { + "epoch": 4.959483015177926, + "grad_norm": 4.491387347459075, + "learning_rate": 7.782260599356606e-10, + "loss": 0.3479, + "step": 14585 + }, + { + "epoch": 4.961183623145274, + "grad_norm": 6.522099950741569, + "learning_rate": 7.120284980441372e-10, + "loss": 0.3316, + "step": 14590 + }, + { + "epoch": 4.962884231112623, + "grad_norm": 6.5595159956988205, + "learning_rate": 6.487725040299487e-10, + "loss": 0.3399, + "step": 14595 + }, + { + "epoch": 4.964584839079971, + "grad_norm": 12.52246280240769, + "learning_rate": 5.884581523429856e-10, + "loss": 0.3101, + "step": 14600 + }, + { + "epoch": 4.966285447047319, + "grad_norm": 3.867868696316935, + "learning_rate": 5.310855139709082e-10, + "loss": 0.3392, + "step": 14605 + }, + { + "epoch": 4.967986055014668, + "grad_norm": 4.72438646289047, + "learning_rate": 4.766546564391461e-10, + "loss": 0.336, + "step": 14610 + }, + { + "epoch": 4.969686662982016, + "grad_norm": 4.621887636559062, + "learning_rate": 4.2516564381089864e-10, + "loss": 0.3262, + "step": 14615 + }, + { + "epoch": 4.971387270949364, + "grad_norm": 5.3117366358284315, + "learning_rate": 3.766185366868569e-10, + "loss": 0.3035, + "step": 14620 + }, + { + "epoch": 4.973087878916713, + "grad_norm": 5.98696897470915, + "learning_rate": 3.3101339220492637e-10, + "loss": 0.3294, + "step": 14625 + }, + { + "epoch": 4.974788486884061, + "grad_norm": 5.125906773564488, + "learning_rate": 2.883502640405045e-10, + "loss": 0.3312, + "step": 14630 + }, + { + "epoch": 4.976489094851409, + "grad_norm": 9.435480750740867, + "learning_rate": 2.486292024070358e-10, + "loss": 0.3434, + "step": 14635 + }, + { + "epoch": 4.978189702818757, + "grad_norm": 8.434443648518124, + "learning_rate": 2.11850254054069e-10, + "loss": 0.3285, + "step": 14640 + }, + { + "epoch": 4.979890310786106, + "grad_norm": 4.90195079746842, + "learning_rate": 1.7801346226947736e-10, + "loss": 0.3252, + "step": 14645 + }, + { + "epoch": 4.981590918753454, + "grad_norm": 21.019745490751443, + "learning_rate": 1.4711886687807097e-10, + "loss": 0.3381, + "step": 14650 + }, + { + "epoch": 4.9832915267208024, + "grad_norm": 4.632978757783584, + "learning_rate": 1.191665042410417e-10, + "loss": 0.3404, + "step": 14655 + }, + { + "epoch": 4.984992134688151, + "grad_norm": 3.9419023704852543, + "learning_rate": 9.415640725762843e-11, + "loss": 0.3189, + "step": 14660 + }, + { + "epoch": 4.986692742655499, + "grad_norm": 4.6062000770567675, + "learning_rate": 7.208860536345174e-11, + "loss": 0.3345, + "step": 14665 + }, + { + "epoch": 4.9883933506228475, + "grad_norm": 7.7648848410049585, + "learning_rate": 5.296312453217933e-11, + "loss": 0.3167, + "step": 14670 + }, + { + "epoch": 4.9900939585901956, + "grad_norm": 6.071758031929791, + "learning_rate": 3.677998727302789e-11, + "loss": 0.305, + "step": 14675 + }, + { + "epoch": 4.9917945665575445, + "grad_norm": 6.055511194549918, + "learning_rate": 2.35392126332612e-11, + "loss": 0.3061, + "step": 14680 + }, + { + "epoch": 4.9934951745248926, + "grad_norm": 5.787453807652962, + "learning_rate": 1.3240816196802287e-11, + "loss": 0.3292, + "step": 14685 + }, + { + "epoch": 4.995195782492241, + "grad_norm": 5.439973255765252, + "learning_rate": 5.884810084511028e-12, + "loss": 0.3282, + "step": 14690 + }, + { + "epoch": 4.9968963904595896, + "grad_norm": 4.592926398935226, + "learning_rate": 1.4712029539065698e-12, + "loss": 0.3253, + "step": 14695 + }, + { + "epoch": 4.998596998426938, + "grad_norm": 4.794272151329833, + "learning_rate": 0.0, + "loss": 0.3183, + "step": 14700 + }, + { + "epoch": 4.998596998426938, + "step": 14700, + "total_flos": 6727881718628352.0, + "train_loss": 0.41388291033757785, + "train_runtime": 737214.1133, + "train_samples_per_second": 1.276, + "train_steps_per_second": 0.02 + } + ], + "logging_steps": 5, + "max_steps": 14700, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6727881718628352.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}